Add a work-in-progress backend for x86_64 using the new instruction selection.

Most of the work is credited to Julian Seward.

Co-authored-by: Julian Seward <jseward@acm.org>
Co-authored-by: Chris Fallin <cfallin@mozilla.com>
Benjamin Bouvier
2020-04-27 16:19:08 +02:00
parent 6bee767129
commit fa54422854
12 changed files with 5690 additions and 6 deletions


@@ -58,10 +58,12 @@ x86 = []
arm32 = []
arm64 = []
riscv = []
x64 = [] # New work-in-progress codegen backend for x86_64 based on the new isel.
# Option to enable all architectures.
all-arch = [
"x86",
"x64",
"arm32",
"arm64",
"riscv"


@@ -3,6 +3,12 @@ use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder};
pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
let mut settings = SettingGroupBuilder::new("x86");
settings.add_bool(
"use_new_backend",
"Whether to use the new codegen backend using the new isel",
false,
);
// CPUID.01H:ECX
let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);


@@ -77,6 +77,9 @@ mod riscv;
#[cfg(feature = "x86")] #[cfg(feature = "x86")]
mod x86; mod x86;
#[cfg(feature = "x64")]
mod x64;
#[cfg(feature = "arm32")] #[cfg(feature = "arm32")]
mod arm32; mod arm32;


@@ -0,0 +1,457 @@
//! Implementation of the standard x64 ABI.
use alloc::vec::Vec;
use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable};
use crate::ir::{self, types, types::*, ArgumentExtension, StackSlot, Type};
use crate::isa::{self, x64::inst::*};
use crate::machinst::*;
use crate::settings;
use args::*;
#[derive(Clone, Debug)]
enum ABIArg {
Reg(RealReg),
_Stack,
}
#[derive(Clone, Debug)]
enum ABIRet {
Reg(RealReg),
_Stack,
}
pub(crate) struct X64ABIBody {
args: Vec<ABIArg>,
rets: Vec<ABIRet>,
/// Offsets to each stack slot.
_stack_slots: Vec<usize>,
/// Total stack size of all the stack slots.
stack_slots_size: usize,
/// Clobbered registers, as indicated by regalloc.
clobbered: Set<Writable<RealReg>>,
/// Total number of spill slots, as indicated by regalloc.
num_spill_slots: Option<usize>,
/// Calculated while creating the prologue, and used when creating the epilogue. Amount by
/// which RSP is adjusted downwards to allocate the spill area.
frame_size_bytes: Option<usize>,
call_conv: isa::CallConv,
/// The settings controlling this function's compilation.
flags: settings::Flags,
}
fn in_int_reg(ty: types::Type) -> bool {
match ty {
types::I8
| types::I16
| types::I32
| types::I64
| types::B1
| types::B8
| types::B16
| types::B32
| types::B64 => true,
_ => false,
}
}
fn get_intreg_for_arg_systemv(idx: usize) -> Option<Reg> {
match idx {
0 => Some(regs::rdi()),
1 => Some(regs::rsi()),
2 => Some(regs::rdx()),
3 => Some(regs::rcx()),
4 => Some(regs::r8()),
5 => Some(regs::r9()),
_ => None,
}
}
fn get_intreg_for_retval_systemv(idx: usize) -> Option<Reg> {
match idx {
0 => Some(regs::rax()),
1 => Some(regs::rdx()),
_ => None,
}
}
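// For illustration: under this System V mapping, the third integer argument (idx 2)
// lands in %rdx, while a seventh integer argument (idx 6) yields None, i.e. it would
// have to be passed on the stack (currently unimplemented below). Likewise the first
// two integer return values go in %rax and %rdx.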
fn is_callee_save_systemv(r: RealReg) -> bool {
use regs::*;
match r.get_class() {
RegClass::I64 => match r.get_hw_encoding() as u8 {
ENC_RBX | ENC_RBP | ENC_R12 | ENC_R13 | ENC_R14 | ENC_R15 => true,
_ => false,
},
_ => unimplemented!(),
}
}
fn get_callee_saves(regs: Vec<Writable<RealReg>>) -> Vec<Writable<RealReg>> {
regs.into_iter()
.filter(|r| is_callee_save_systemv(r.to_reg()))
.collect()
}
impl X64ABIBody {
/// Create a new body ABI instance.
pub(crate) fn new(f: &ir::Function, flags: settings::Flags) -> Self {
// Compute args and retvals from signature.
let mut args = vec![];
let mut next_int_arg = 0;
for param in &f.signature.params {
match param.purpose {
ir::ArgumentPurpose::VMContext if f.signature.call_conv.extends_baldrdash() => {
// `VMContext` is `r14` in Baldrdash.
args.push(ABIArg::Reg(regs::r14().to_real_reg()));
}
ir::ArgumentPurpose::Normal | ir::ArgumentPurpose::VMContext => {
if in_int_reg(param.value_type) {
if let Some(reg) = get_intreg_for_arg_systemv(next_int_arg) {
args.push(ABIArg::Reg(reg.to_real_reg()));
} else {
unimplemented!("passing arg on the stack");
}
next_int_arg += 1;
} else {
unimplemented!("non int normal register")
}
}
_ => unimplemented!("other parameter purposes"),
}
}
let mut rets = vec![];
let mut next_int_retval = 0;
for ret in &f.signature.returns {
match ret.purpose {
ir::ArgumentPurpose::Normal => {
if in_int_reg(ret.value_type) {
if let Some(reg) = get_intreg_for_retval_systemv(next_int_retval) {
rets.push(ABIRet::Reg(reg.to_real_reg()));
} else {
unimplemented!("passing return on the stack");
}
next_int_retval += 1;
} else {
unimplemented!("returning non integer normal value");
}
}
_ => {
unimplemented!("non normal argument purpose");
}
}
}
// Compute stackslot locations and total stackslot size.
let mut stack_offset: usize = 0;
let mut _stack_slots = vec![];
for (stackslot, data) in f.stack_slots.iter() {
let off = stack_offset;
stack_offset += data.size as usize;
// Align to 8 bytes.
stack_offset = (stack_offset + 7) & !7usize;
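// For example, stack slots of 12 and 5 bytes get offsets 0 and 16, and the
// rounded-up total (stack_slots_size) ends at 24.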
debug_assert_eq!(stackslot.as_u32() as usize, _stack_slots.len());
_stack_slots.push(off);
}
Self {
args,
rets,
_stack_slots,
stack_slots_size: stack_offset,
clobbered: Set::empty(),
num_spill_slots: None,
frame_size_bytes: None,
call_conv: f.signature.call_conv.clone(),
flags,
}
}
}
impl ABIBody for X64ABIBody {
type I = Inst;
fn flags(&self) -> &settings::Flags {
&self.flags
}
fn num_args(&self) -> usize {
unimplemented!()
}
fn num_retvals(&self) -> usize {
unimplemented!()
}
fn num_stackslots(&self) -> usize {
unimplemented!()
}
fn liveins(&self) -> Set<RealReg> {
let mut set: Set<RealReg> = Set::empty();
for arg in &self.args {
if let &ABIArg::Reg(r) = arg {
set.insert(r);
}
}
set
}
fn liveouts(&self) -> Set<RealReg> {
let mut set: Set<RealReg> = Set::empty();
for ret in &self.rets {
if let &ABIRet::Reg(r) = ret {
set.insert(r);
}
}
set
}
fn gen_copy_arg_to_reg(&self, idx: usize, to_reg: Writable<Reg>) -> Inst {
match &self.args[idx] {
ABIArg::Reg(from_reg) => {
if from_reg.get_class() == RegClass::I32 || from_reg.get_class() == RegClass::I64 {
// TODO do we need a sign extension if it's I32?
return Inst::mov_r_r(/*is64=*/ true, from_reg.to_reg(), to_reg);
}
unimplemented!("moving from non-int arg to vreg");
}
ABIArg::_Stack => unimplemented!("moving from stack arg to vreg"),
}
}
fn gen_copy_reg_to_retval(
&self,
idx: usize,
from_reg: Writable<Reg>,
ext: ArgumentExtension,
) -> Vec<Inst> {
match ext {
ArgumentExtension::None => {}
_ => unimplemented!(
"unimplemented argument extension {:?} is required for baldrdash",
ext
),
};
let mut ret = Vec::new();
match &self.rets[idx] {
ABIRet::Reg(to_reg) => {
if to_reg.get_class() == RegClass::I32 || to_reg.get_class() == RegClass::I64 {
ret.push(Inst::mov_r_r(
/*is64=*/ true,
from_reg.to_reg(),
Writable::<Reg>::from_reg(to_reg.to_reg()),
))
} else {
unimplemented!("moving from vreg to non-int return value");
}
}
ABIRet::_Stack => {
unimplemented!("moving from vreg to stack return value");
}
}
ret
}
fn gen_ret(&self) -> Inst {
Inst::ret()
}
fn gen_epilogue_placeholder(&self) -> Inst {
Inst::epilogue_placeholder()
}
fn set_num_spillslots(&mut self, slots: usize) {
self.num_spill_slots = Some(slots);
}
fn set_clobbered(&mut self, clobbered: Set<Writable<RealReg>>) {
self.clobbered = clobbered;
}
fn stackslot_addr(&self, _slot: StackSlot, _offset: u32, _into_reg: Writable<Reg>) -> Inst {
unimplemented!()
}
fn load_stackslot(
&self,
_slot: StackSlot,
_offset: u32,
_ty: Type,
_into_reg: Writable<Reg>,
) -> Inst {
unimplemented!("load_stackslot")
}
fn store_stackslot(&self, _slot: StackSlot, _offset: u32, _ty: Type, _from_reg: Reg) -> Inst {
unimplemented!("store_stackslot")
}
fn load_spillslot(&self, _slot: SpillSlot, _ty: Type, _into_reg: Writable<Reg>) -> Inst {
unimplemented!("load_spillslot")
}
fn store_spillslot(&self, _slot: SpillSlot, _ty: Type, _from_reg: Reg) -> Inst {
unimplemented!("store_spillslot")
}
fn gen_prologue(&mut self) -> Vec<Inst> {
let r_rsp = regs::rsp();
let mut insts = vec![];
// Baldrdash generates its own prologue sequence, so we don't have to.
if !self.call_conv.extends_baldrdash() {
let r_rbp = regs::rbp();
let w_rbp = Writable::<Reg>::from_reg(r_rbp);
// The "traditional" pre-preamble
// RSP before the call will be 0 % 16. So here, it is 8 % 16.
insts.push(Inst::push64(RMI::reg(r_rbp)));
// RSP is now 0 % 16
insts.push(Inst::mov_r_r(true, r_rsp, w_rbp));
}
// Save callee saved registers that we trash. Keep track of how much space we've used, so
// as to know what we have to do to get the base of the spill area 0 % 16.
let mut callee_saved_used = 0;
let clobbered = get_callee_saves(self.clobbered.to_vec());
for reg in clobbered {
let r_reg = reg.to_reg();
match r_reg.get_class() {
RegClass::I64 => {
insts.push(Inst::push64(RMI::reg(r_reg.to_reg())));
callee_saved_used += 8;
}
_ => unimplemented!(),
}
}
let mut total_stacksize = self.stack_slots_size + 8 * self.num_spill_slots.unwrap();
if self.call_conv.extends_baldrdash() {
// Baldrdash expects the stack to take at least the number of words set in
// baldrdash_prologue_words; count them here.
debug_assert!(
!self.flags.enable_probestack(),
"baldrdash does not expect cranelift to emit stack probes"
);
total_stacksize += self.flags.baldrdash_prologue_words() as usize * 8;
}
debug_assert!(callee_saved_used % 16 == 0 || callee_saved_used % 16 == 8);
let frame_size = total_stacksize + callee_saved_used % 16;
// Now make sure the frame stack is aligned, so RSP == 0 % 16 in the function's body.
let frame_size = (frame_size + 15) & !15;
if frame_size > 0x7FFF_FFFF {
unimplemented!("gen_prologue(x86): total_stacksize >= 2G");
}
if !self.call_conv.extends_baldrdash() {
// Explicitly allocate the frame.
let w_rsp = Writable::<Reg>::from_reg(r_rsp);
if frame_size > 0 {
insts.push(Inst::alu_rmi_r(
true,
RMI_R_Op::Sub,
RMI::imm(frame_size as u32),
w_rsp,
));
}
}
// Stash this value. We'll need it for the epilogue.
debug_assert!(self.frame_size_bytes.is_none());
self.frame_size_bytes = Some(frame_size);
insts
}
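// Illustrative example (a sketch, not exercised here): for a non-baldrdash function
// that clobbers %rbx and needs 40 bytes of stack slots plus spill slots, the prologue
// built above comes out as roughly:
// pushq %rbp
// movq %rsp, %rbp
// pushq %rbx
// subq $48, %rsp // 40 + (8 % 16), rounded up to a 16-byte multiple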
fn gen_epilogue(&self) -> Vec<Inst> {
let mut insts = vec![];
// Undo what we did in the prologue.
// Clear the spill area and the 16-alignment padding below it.
if !self.call_conv.extends_baldrdash() {
let frame_size = self.frame_size_bytes.unwrap();
if frame_size > 0 {
let r_rsp = regs::rsp();
let w_rsp = Writable::<Reg>::from_reg(r_rsp);
insts.push(Inst::alu_rmi_r(
true,
RMI_R_Op::Add,
RMI::imm(frame_size as u32),
w_rsp,
));
}
}
// Restore regs.
let clobbered = get_callee_saves(self.clobbered.to_vec());
for w_real_reg in clobbered.into_iter().rev() {
match w_real_reg.to_reg().get_class() {
RegClass::I64 => {
// TODO: make these conversion sequences less cumbersome.
insts.push(Inst::pop64(Writable::<Reg>::from_reg(
w_real_reg.to_reg().to_reg(),
)))
}
_ => unimplemented!(),
}
}
// Baldrdash generates its own preamble.
if !self.call_conv.extends_baldrdash() {
let r_rbp = regs::rbp();
let w_rbp = Writable::<Reg>::from_reg(r_rbp);
// Undo the "traditional" pre-preamble
// RSP before the call will be 0 % 16. So here, it is 8 % 16.
insts.push(Inst::pop64(w_rbp));
insts.push(Inst::ret());
}
insts
}
fn frame_size(&self) -> u32 {
self.frame_size_bytes
.expect("frame size not computed before prologue generation") as u32
}
fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 {
// We allocate in terms of 8-byte slots.
match (rc, ty) {
(RegClass::I64, _) => 1,
(RegClass::V128, F32) | (RegClass::V128, F64) => 1,
(RegClass::V128, _) => 2,
_ => panic!("Unexpected register class!"),
}
}
fn gen_spill(&self, _to_slot: SpillSlot, _from_reg: RealReg, _ty: Type) -> Inst {
unimplemented!()
}
fn gen_reload(&self, _to_reg: Writable<RealReg>, _from_slot: SpillSlot, _ty: Type) -> Inst {
unimplemented!()
}
}


@@ -0,0 +1,451 @@
//! Instruction operand sub-components (aka "parts"): definitions and printing.
use std::fmt;
use std::string::{String, ToString};
use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageCollector};
use crate::binemit::CodeOffset;
use crate::machinst::*;
use super::regs::show_ireg_sized;
/// A Memory Address. These denote a 64-bit value only.
#[derive(Clone)]
pub(crate) enum Addr {
/// Immediate sign-extended and a Register.
IR { simm32: u32, base: Reg },
/// sign-extend-32-to-64(Immediate) + Register1 + (Register2 << Shift)
IRRS {
simm32: u32,
base: Reg,
index: Reg,
shift: u8, /* 0 .. 3 only */
},
}
impl Addr {
// Constructors.
pub(crate) fn imm_reg(simm32: u32, base: Reg) -> Self {
debug_assert!(base.get_class() == RegClass::I64);
Self::IR { simm32, base }
}
pub(crate) fn imm_reg_reg_shift(simm32: u32, base: Reg, index: Reg, shift: u8) -> Self {
debug_assert!(base.get_class() == RegClass::I64);
debug_assert!(index.get_class() == RegClass::I64);
debug_assert!(shift <= 3);
Addr::IRRS {
simm32,
base,
index,
shift,
}
}
/// Add the regs mentioned by `self` to `collector`.
pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
match self {
Addr::IR { simm32: _, base } => {
collector.add_use(*base);
}
Addr::IRRS {
simm32: _,
base,
index,
shift: _,
} => {
collector.add_use(*base);
collector.add_use(*index);
}
}
}
}
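// For illustration (sketch): an address such as 16(%rdi,%rsi,4), i.e. rdi + rsi*4 + 16,
// would be built as Addr::imm_reg_reg_shift(16, rdi, rsi, 2), and 8(%rax) as
// Addr::imm_reg(8, rax), where rdi/rsi/rax stand for the corresponding real Regs.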
impl ShowWithRRU for Addr {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
match self {
Addr::IR { simm32, base } => format!("{}({})", *simm32 as i32, base.show_rru(mb_rru)),
Addr::IRRS {
simm32,
base,
index,
shift,
} => format!(
"{}({},{},{})",
*simm32 as i32,
base.show_rru(mb_rru),
index.show_rru(mb_rru),
1 << shift
),
}
}
}
/// An operand which is either an integer Register, a value in Memory or an Immediate. This can
/// denote an 8, 16, 32 or 64 bit value. For the Immediate form, in the 8- and 16-bit case, only
/// the lower 8 or 16 bits of `simm32` is relevant. In the 64-bit case, the value denoted by
/// `simm32` is its sign-extension out to 64 bits.
#[derive(Clone)]
pub(crate) enum RMI {
R { reg: Reg },
M { addr: Addr },
I { simm32: u32 },
}
impl RMI {
// Constructors
pub(crate) fn reg(reg: Reg) -> RMI {
debug_assert!(reg.get_class() == RegClass::I64);
RMI::R { reg }
}
pub(crate) fn mem(addr: Addr) -> RMI {
RMI::M { addr }
}
pub(crate) fn imm(simm32: u32) -> RMI {
RMI::I { simm32 }
}
/// Add the regs mentioned by `self` to `collector`.
pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
match self {
RMI::R { reg } => collector.add_use(*reg),
RMI::M { addr } => addr.get_regs_as_uses(collector),
RMI::I { simm32: _ } => {}
}
}
}
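// For example, RMI::imm(0xFFFF_FFF0) used as a 64-bit operand denotes -16 (sign-extended),
// while in an 8-bit context only its low byte (0xF0) would be significant.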
impl ShowWithRRU for RMI {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
self.show_rru_sized(mb_rru, 8)
}
fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
match self {
RMI::R { reg } => show_ireg_sized(*reg, mb_rru, size),
RMI::M { addr } => addr.show_rru(mb_rru),
RMI::I { simm32 } => format!("${}", *simm32 as i32),
}
}
}
/// An operand which is either an integer Register or a value in Memory. This can denote an 8, 16,
/// 32 or 64 bit value.
#[derive(Clone)]
pub(crate) enum RM {
R { reg: Reg },
M { addr: Addr },
}
impl RM {
// Constructors.
pub(crate) fn reg(reg: Reg) -> Self {
debug_assert!(reg.get_class() == RegClass::I64);
RM::R { reg }
}
pub(crate) fn mem(addr: Addr) -> Self {
RM::M { addr }
}
/// Add the regs mentioned by `self` to `collector`.
pub(crate) fn get_regs_as_uses(&self, collector: &mut RegUsageCollector) {
match self {
RM::R { reg } => collector.add_use(*reg),
RM::M { addr } => addr.get_regs_as_uses(collector),
}
}
}
impl ShowWithRRU for RM {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
self.show_rru_sized(mb_rru, 8)
}
fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
match self {
RM::R { reg } => show_ireg_sized(*reg, mb_rru, size),
RM::M { addr } => addr.show_rru(mb_rru),
}
}
}
/// Some basic ALU operations. TODO: maybe add Adc, Sbb.
#[derive(Clone, PartialEq)]
pub enum RMI_R_Op {
Add,
Sub,
And,
Or,
Xor,
/// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
Mul,
}
impl RMI_R_Op {
pub(crate) fn to_string(&self) -> String {
match self {
RMI_R_Op::Add => "add".to_string(),
RMI_R_Op::Sub => "sub".to_string(),
RMI_R_Op::And => "and".to_string(),
RMI_R_Op::Or => "or".to_string(),
RMI_R_Op::Xor => "xor".to_string(),
RMI_R_Op::Mul => "imul".to_string(),
}
}
}
impl fmt::Debug for RMI_R_Op {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// These indicate ways of extending (widening) a value, using the Intel naming:
/// B(yte) = u8, W(ord) = u16, L(ong)word = u32, Q(uad)word = u64
#[derive(Clone, PartialEq)]
pub enum ExtMode {
/// Byte -> Longword.
BL,
/// Byte -> Quadword.
BQ,
/// Word -> Longword.
WL,
/// Word -> Quadword.
WQ,
/// Longword -> Quadword.
LQ,
}
impl ExtMode {
pub(crate) fn to_string(&self) -> String {
match self {
ExtMode::BL => "bl".to_string(),
ExtMode::BQ => "bq".to_string(),
ExtMode::WL => "wl".to_string(),
ExtMode::WQ => "wq".to_string(),
ExtMode::LQ => "lq".to_string(),
}
}
pub(crate) fn dst_size(&self) -> u8 {
match self {
ExtMode::BL => 4,
ExtMode::BQ => 8,
ExtMode::WL => 4,
ExtMode::WQ => 8,
ExtMode::LQ => 8,
}
}
}
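// For example, ExtMode::WQ describes widening a 16-bit value to 64 bits (movzwq/movswq);
// its dst_size() is 8 bytes, whereas BL and WL report a 4-byte destination.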
impl fmt::Debug for ExtMode {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// These indicate the form of a scalar shift: left, signed right, unsigned right.
#[derive(Clone)]
pub enum ShiftKind {
Left,
RightZ,
RightS,
}
impl ShiftKind {
pub(crate) fn to_string(&self) -> String {
match self {
ShiftKind::Left => "shl".to_string(),
ShiftKind::RightZ => "shr".to_string(),
ShiftKind::RightS => "sar".to_string(),
}
}
}
impl fmt::Debug for ShiftKind {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// These indicate condition code tests. Not all are represented since not all are useful in
/// compiler-generated code.
#[derive(Copy, Clone)]
#[repr(u8)]
pub enum CC {
/// overflow
O = 0,
/// no overflow
NO = 1,
/// < unsigned
B = 2,
/// >= unsigned
NB = 3,
/// zero
Z = 4,
/// not-zero
NZ = 5,
/// <= unsigned
BE = 6,
/// > unsigned
NBE = 7,
/// negative
S = 8,
/// not-negative
NS = 9,
/// < signed
L = 12,
/// >= signed
NL = 13,
/// <= signed
LE = 14,
/// > signed
NLE = 15,
}
impl CC {
pub(crate) fn to_string(&self) -> String {
match self {
CC::O => "o".to_string(),
CC::NO => "no".to_string(),
CC::B => "b".to_string(),
CC::NB => "nb".to_string(),
CC::Z => "z".to_string(),
CC::NZ => "nz".to_string(),
CC::BE => "be".to_string(),
CC::NBE => "nbe".to_string(),
CC::S => "s".to_string(),
CC::NS => "ns".to_string(),
CC::L => "l".to_string(),
CC::NL => "nl".to_string(),
CC::LE => "le".to_string(),
CC::NLE => "nle".to_string(),
}
}
pub(crate) fn invert(&self) -> CC {
match self {
CC::O => CC::NO,
CC::NO => CC::O,
CC::B => CC::NB,
CC::NB => CC::B,
CC::Z => CC::NZ,
CC::NZ => CC::Z,
CC::BE => CC::NBE,
CC::NBE => CC::BE,
CC::S => CC::NS,
CC::NS => CC::S,
CC::L => CC::NL,
CC::NL => CC::L,
CC::LE => CC::NLE,
CC::NLE => CC::LE,
}
}
pub(crate) fn get_enc(self) -> u8 {
self as u8
}
}
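// Note that these values match the hardware cc field (a Jcc is emitted as 0x0F, 0x80 + cc;
// see the emitter), and each invert() pair differs only in the lowest encoding bit,
// e.g. CC::L (12) vs CC::NL (13).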
impl fmt::Debug for CC {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
write!(fmt, "{}", self.to_string())
}
}
/// A branch target. Either unresolved (basic-block index) or resolved (offset
/// from end of current instruction).
#[derive(Clone, Copy, Debug)]
pub enum BranchTarget {
/// An unresolved reference to a BlockIndex, as passed into
/// `lower_branch_group()`.
Block(BlockIndex),
/// A resolved reference to another instruction, after
/// `Inst::with_block_offsets()`. This offset is in bytes.
ResolvedOffset(BlockIndex, isize),
}
impl ShowWithRRU for BranchTarget {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
match self {
BranchTarget::Block(bix) => format!("(Block {})", bix),
BranchTarget::ResolvedOffset(bix, offs) => format!("(Block {}, offset {})", bix, offs),
}
}
}
impl BranchTarget {
/// Lower the branch target given offsets of each block.
pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) {
match self {
&mut BranchTarget::Block(bix) => {
let bix = bix as usize;
assert!(bix < targets.len());
let block_offset_in_func = targets[bix];
let branch_offset = (block_offset_in_func as isize) - (my_offset as isize);
*self = BranchTarget::ResolvedOffset(bix as BlockIndex, branch_offset);
}
&mut BranchTarget::ResolvedOffset(..) => {}
}
}
/// Get the block index.
pub fn as_block_index(&self) -> Option<BlockIndex> {
match self {
&BranchTarget::Block(bix) => Some(bix),
_ => None,
}
}
/// Get the offset as a signed 32 bit byte offset. This returns the
/// offset in bytes between the first byte of the source and the first
/// byte of the target. It does not take into account the Intel-specific
/// rule that a branch offset is encoded as relative to the start of the
/// following instruction. That is a problem for the emitter to deal
/// with.
pub fn as_offset_i32(&self) -> Option<i32> {
match self {
&BranchTarget::ResolvedOffset(_, off) => {
// Leave a bit of slack so that the emitter is guaranteed to
// be able to add the length of the jump instruction encoding
// to this value and still have a value in signed-32 range.
if off >= -0x7FFF_FF00isize && off <= 0x7FFF_FF00isize {
Some(off as i32)
} else {
None
}
}
_ => None,
}
}
/// Map the block index given a transform map.
pub fn map(&mut self, block_index_map: &[BlockIndex]) {
match self {
&mut BranchTarget::Block(ref mut bix) => {
let n = block_index_map[*bix as usize];
*bix = n;
}
_ => panic!("BranchTarget::map() called on already-lowered BranchTarget!"),
}
}
}
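// Worked example: a jump placed 100 bytes into the function whose target block starts
// at offset 40 lowers to ResolvedOffset(bix, -60); the emitter then subtracts the length
// of the jump encoding itself (5 bytes for `jmp rel32`), producing a stored rel32 of -65.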


@@ -0,0 +1,888 @@
use regalloc::{Reg, RegClass};
use crate::isa::x64::inst::*;
fn low8willSXto64(x: u32) -> bool {
let xs = (x as i32) as i64;
xs == ((xs << 56) >> 56)
}
fn low8willSXto32(x: u32) -> bool {
let xs = x as i32;
xs == ((xs << 24) >> 24)
}
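// For example, low8willSXto32(0x7F) and low8willSXto32(0xFFFF_FF80) (i.e. -128) are true,
// but low8willSXto32(0x80) (i.e. +128) is false, since sign-extending its low byte gives -128.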
//=============================================================================
// Instructions and subcomponents: emission
// For all of the routines that take both a memory-or-reg operand (sometimes
// called "E" in the Intel documentation) and a reg-only operand ("G" in
// Intelese), the order is always G first, then E.
//
// "enc" in the following means "hardware register encoding number".
#[inline(always)]
fn mkModRegRM(m0d: u8, encRegG: u8, rmE: u8) -> u8 {
debug_assert!(m0d < 4);
debug_assert!(encRegG < 8);
debug_assert!(rmE < 8);
((m0d & 3) << 6) | ((encRegG & 7) << 3) | (rmE & 7)
}
#[inline(always)]
fn mkSIB(shift: u8, encIndex: u8, encBase: u8) -> u8 {
debug_assert!(shift < 4);
debug_assert!(encIndex < 8);
debug_assert!(encBase < 8);
((shift & 3) << 6) | ((encIndex & 7) << 3) | (encBase & 7)
}
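// For example, mkModRegRM(0b11, 0, 1) == 0xC1: the register-direct form with rax's
// encoding (0) in the reg field and rcx's encoding (1) in the r/m field. Likewise
// mkSIB(2, 6, 7) == 0xB7: scale 4, index rsi, base rdi.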
/// Get the encoding number from something which we sincerely hope is a real
/// register of class I64.
#[inline(always)]
fn iregEnc(reg: Reg) -> u8 {
debug_assert!(reg.is_real());
debug_assert!(reg.get_class() == RegClass::I64);
reg.get_hw_encoding()
}
// F_*: these flags describe special handling of the insn to be generated. Be
// careful with these. It is easy to create nonsensical combinations.
const F_NONE: u32 = 0;
/// Emit the REX prefix byte even if it appears to be redundant (== 0x40).
const F_RETAIN_REDUNDANT_REX: u32 = 1;
/// Set the W bit in the REX prefix to zero. By default it will be set to 1,
/// indicating a 64-bit operation.
const F_CLEAR_REX_W: u32 = 2;
/// Add an 0x66 (operand-size override) prefix. This is necessary to indicate
/// a 16-bit operation. Normally this will be used together with F_CLEAR_REX_W.
const F_PREFIX_66: u32 = 4;
/// This is the core 'emit' function for instructions that reference memory.
///
/// For an instruction that has as operands a register `encG` and a memory
/// address `memE`, create and emit, first the REX prefix, then caller-supplied
/// opcode byte(s) (`opcodes` and `numOpcodes`), then the MOD/RM byte, then
/// optionally, a SIB byte, and finally optionally an immediate that will be
/// derived from the `memE` operand. For most instructions up to and including
/// SSE4.2, that will be the whole instruction.
///
/// The opcodes are written bigendianly for the convenience of callers. For
/// example, if the opcode bytes to be emitted are, in this order, F3 0F 27,
/// then the caller should pass `opcodes` == 0xF3_0F_27 and `numOpcodes` == 3.
///
/// The register operand is represented here not as a `Reg` but as its hardware
/// encoding, `encG`. `flags` can specify special handling for the REX prefix.
/// By default, the REX prefix will indicate a 64-bit operation and will be
/// deleted if it is redundant (0x40). Note that for a 64-bit operation, the
/// REX prefix will normally never be redundant, since REX.W must be 1 to
/// indicate a 64-bit operation.
fn emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE<O: MachSectionOutput>(
sink: &mut O,
opcodes: u32,
mut numOpcodes: usize,
encG: u8,
memE: &Addr,
flags: u32,
) {
// General comment for this function: the registers in `memE` must be
// 64-bit integer registers, because they are part of an address
// expression. But `encG` can be derived from a register of any class.
let prefix66 = (flags & F_PREFIX_66) != 0;
let clearRexW = (flags & F_CLEAR_REX_W) != 0;
let retainRedundant = (flags & F_RETAIN_REDUNDANT_REX) != 0;
// The operand-size override, if requested. This indicates a 16-bit
// operation.
if prefix66 {
sink.put1(0x66);
}
match memE {
Addr::IR { simm32, base: regE } => {
// First, cook up the REX byte. This is easy.
let encE = iregEnc(*regE);
let w = if clearRexW { 0 } else { 1 };
let r = (encG >> 3) & 1;
let x = 0;
let b = (encE >> 3) & 1;
let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
if rex != 0x40 || retainRedundant {
sink.put1(rex);
}
// Now the opcode(s). These include any other prefixes the caller
// hands to us.
while numOpcodes > 0 {
numOpcodes -= 1;
sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
}
// Now the mod/rm and associated immediates. This is
// significantly complicated due to the multiple special cases.
if *simm32 == 0
&& encE != regs::ENC_RSP
&& encE != regs::ENC_RBP
&& encE != regs::ENC_R12
&& encE != regs::ENC_R13
{
// FIXME JRS 2020Feb11: those four tests can surely be
// replaced by a single mask-and-compare check. We should do
// that because this routine is likely to be hot.
sink.put1(mkModRegRM(0, encG & 7, encE & 7));
} else if *simm32 == 0 && (encE == regs::ENC_RSP || encE == regs::ENC_R12) {
sink.put1(mkModRegRM(0, encG & 7, 4));
sink.put1(0x24);
} else if low8willSXto32(*simm32) && encE != regs::ENC_RSP && encE != regs::ENC_R12 {
sink.put1(mkModRegRM(1, encG & 7, encE & 7));
sink.put1((simm32 & 0xFF) as u8);
} else if encE != regs::ENC_RSP && encE != regs::ENC_R12 {
sink.put1(mkModRegRM(2, encG & 7, encE & 7));
sink.put4(*simm32);
} else if (encE == regs::ENC_RSP || encE == regs::ENC_R12) && low8willSXto32(*simm32) {
// REX.B distinguishes RSP from R12
sink.put1(mkModRegRM(1, encG & 7, 4));
sink.put1(0x24);
sink.put1((simm32 & 0xFF) as u8);
} else if encE == regs::ENC_R12 || encE == regs::ENC_RSP {
//.. wait for test case for RSP case
// REX.B distinguishes RSP from R12
sink.put1(mkModRegRM(2, encG & 7, 4));
sink.put1(0x24);
sink.put4(*simm32);
} else {
unreachable!("emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE: IR");
}
}
// Bizarrely, the IRRS case is much simpler.
Addr::IRRS {
simm32,
base: regBase,
index: regIndex,
shift,
} => {
let encBase = iregEnc(*regBase);
let encIndex = iregEnc(*regIndex);
// The rex byte
let w = if clearRexW { 0 } else { 1 };
let r = (encG >> 3) & 1;
let x = (encIndex >> 3) & 1;
let b = (encBase >> 3) & 1;
let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
if rex != 0x40 || retainRedundant {
sink.put1(rex);
}
// All other prefixes and opcodes
while numOpcodes > 0 {
numOpcodes -= 1;
sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
}
// modrm, SIB, immediates
if low8willSXto32(*simm32) && encIndex != regs::ENC_RSP {
sink.put1(mkModRegRM(1, encG & 7, 4));
sink.put1(mkSIB(*shift, encIndex & 7, encBase & 7));
sink.put1(*simm32 as u8);
} else if encIndex != regs::ENC_RSP {
sink.put1(mkModRegRM(2, encG & 7, 4));
sink.put1(mkSIB(*shift, encIndex & 7, encBase & 7));
sink.put4(*simm32);
} else {
panic!("emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE: IRRS");
}
}
}
}
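// Worked example: emitting `movq 8(%rdi), %rax` through this routine (opcodes == 0x8B,
// numOpcodes == 1, encG == 0 (rax), memE == IR { simm32: 8, base: rdi }, flags == F_NONE)
// produces the REX byte 0x48 (W=1), the opcode byte 0x8B, the ModRM byte 0x47 (mod=01,
// reg=rax, r/m=rdi), and the 8-bit displacement 0x08 -- i.e. the bytes 48 8B 47 08.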
/// This is the core 'emit' function for instructions that do not reference
/// memory.
///
/// This is conceptually the same as
/// emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE, except it is for the case
/// where the E operand is a register rather than memory. Hence it is much
/// simpler.
fn emit_REX_OPCODES_MODRM_encG_encE<O: MachSectionOutput>(
sink: &mut O,
opcodes: u32,
mut numOpcodes: usize,
encG: u8,
encE: u8,
flags: u32,
) {
// EncG and EncE can be derived from registers of any class, and they
// don't even have to be from the same class. For example, for an
// integer-to-FP conversion insn, one might be RegClass::I64 and the other
// RegClass::V128.
let prefix66 = (flags & F_PREFIX_66) != 0;
let clearRexW = (flags & F_CLEAR_REX_W) != 0;
let retainRedundant = (flags & F_RETAIN_REDUNDANT_REX) != 0;
// The operand-size override
if prefix66 {
sink.put1(0x66);
}
// The rex byte
let w = if clearRexW { 0 } else { 1 };
let r = (encG >> 3) & 1;
let x = 0;
let b = (encE >> 3) & 1;
let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
if rex != 0x40 || retainRedundant {
sink.put1(rex);
}
// All other prefixes and opcodes
while numOpcodes > 0 {
numOpcodes -= 1;
sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
}
// Now the mod/rm byte. The instruction we're generating doesn't access
// memory, so there is no SIB byte or immediate -- we're done.
sink.put1(mkModRegRM(3, encG & 7, encE & 7));
}
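// Worked example: `movq %rsi, %rdi` (see Inst::Mov_R_R below, with G = src = rsi and
// E = dst = rdi) emits the REX byte 0x48 (W=1), the opcode 0x89, and ModRM 0xF7
// (mod=11, reg=rsi, r/m=rdi), i.e. the bytes 48 89 F7.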
// These are merely wrappers for the above two functions that facilitate passing
// actual `Reg`s rather than their encodings.
fn emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE<O: MachSectionOutput>(
sink: &mut O,
opcodes: u32,
numOpcodes: usize,
regG: Reg,
memE: &Addr,
flags: u32,
) {
// JRS FIXME 2020Feb07: this should really just be `regEnc` not `iregEnc`
let encG = iregEnc(regG);
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(sink, opcodes, numOpcodes, encG, memE, flags);
}
fn emit_REX_OPCODES_MODRM_regG_regE<O: MachSectionOutput>(
sink: &mut O,
opcodes: u32,
numOpcodes: usize,
regG: Reg,
regE: Reg,
flags: u32,
) {
// JRS FIXME 2020Feb07: these should really just be `regEnc` not `iregEnc`
let encG = iregEnc(regG);
let encE = iregEnc(regE);
emit_REX_OPCODES_MODRM_encG_encE(sink, opcodes, numOpcodes, encG, encE, flags);
}
/// Write a suitable number of bits from an imm64 to the sink.
fn emit_simm<O: MachSectionOutput>(sink: &mut O, size: u8, simm32: u32) {
match size {
8 | 4 => sink.put4(simm32),
2 => sink.put2(simm32 as u16),
1 => sink.put1(simm32 as u8),
_ => panic!("x64::Inst::emit_simm: unreachable"),
}
}
/// The top-level emit function.
///
/// Important! Do not add improved (shortened) encoding cases to existing
/// instructions without also adding tests for those improved encodings. That
/// is a dangerous game that leads to hard-to-track-down errors in the emitted
/// code.
///
/// For all instructions, make sure to have test coverage for all of the
/// following situations. Do this by creating the cross product resulting from
/// applying the following rules to each operand:
///
/// (1) for any insn that mentions a register: one test using a register from
/// the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
/// using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
/// This helps detect incorrect REX prefix construction.
///
/// (2) for any insn that mentions a byte register: one test for each of the
/// four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
/// [r8b .. r11b] and [r12b .. r15b]. This checks that
/// apparently-redundant REX prefixes are retained when required.
///
/// (3) for any insn that contains an immediate field, check the following
/// cases: field is zero, field is in simm8 range (-128 .. 127), field is
/// in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF). This is because some
/// instructions that require a 32-bit immediate have a short-form encoding
/// when the imm is in simm8 range.
///
/// Rules (1), (2) and (3) don't apply for registers within address expressions
/// (`Addr`s). Those are already pretty well tested, and the registers in them
/// don't have any effect on the containing instruction (apart from possibly
/// requiring REX prefix bits).
///
/// When choosing registers for a test, avoid using registers with the same
/// offset within a given group. For example, don't use rax and r8, since they
/// both have the lowest 3 bits as 000, and so the test won't detect errors
/// where those 3-bit register sub-fields are confused by the emitter. Instead
/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001). Similarly, don't use (eg) cl
/// and bpl since they have the same offset in their group; use instead (eg) cl
/// and sil.
///
/// For all instructions, also add a test that uses only low-half registers
/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
/// prefixes are correctly omitted. This low-half restriction must apply to
/// _all_ registers in the insn, even those in address expressions.
///
/// Following these rules creates large numbers of test cases, but it's the
/// only way to make the emitter reliable.
///
/// Known possible improvements:
///
/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate. (Do we
/// care?)
pub(crate) fn emit<O: MachSectionOutput>(inst: &Inst, sink: &mut O) {
match inst {
Inst::Nop { len: 0 } => {}
Inst::Alu_RMI_R {
is_64,
op,
src: srcE,
dst: regG,
} => {
let flags = if *is_64 { F_NONE } else { F_CLEAR_REX_W };
if *op == RMI_R_Op::Mul {
// We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
// we have to special-case it.
match srcE {
RMI::R { reg: regE } => {
emit_REX_OPCODES_MODRM_regG_regE(
sink,
0x0FAF,
2,
regG.to_reg(),
*regE,
flags,
);
}
RMI::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FAF,
2,
regG.to_reg(),
addr,
flags,
);
}
RMI::I { simm32 } => {
let useImm8 = low8willSXto32(*simm32);
let opcode = if useImm8 { 0x6B } else { 0x69 };
// Yes, really, regG twice.
emit_REX_OPCODES_MODRM_regG_regE(
sink,
opcode,
1,
regG.to_reg(),
regG.to_reg(),
flags,
);
emit_simm(sink, if useImm8 { 1 } else { 4 }, *simm32);
}
}
} else {
let (opcode_R, opcode_M, subopcode_I) = match op {
RMI_R_Op::Add => (0x01, 0x03, 0),
RMI_R_Op::Sub => (0x29, 0x2B, 5),
RMI_R_Op::And => (0x21, 0x23, 4),
RMI_R_Op::Or => (0x09, 0x0B, 1),
RMI_R_Op::Xor => (0x31, 0x33, 6),
RMI_R_Op::Mul => panic!("unreachable"),
};
match srcE {
RMI::R { reg: regE } => {
// Note. The arguments .. regE .. regG .. sequence
// here is the opposite of what is expected. I'm not
// sure why this is. But I am fairly sure that the
// arg order could be switched back to the expected
// .. regG .. regE .. if opcode_rr is also switched
// over to the "other" basic integer opcode (viz, the
// R/RM vs RM/R duality). However, that would mean
// that the test results won't be in accordance with
// the GNU as reference output. In other words, the
// inversion exists as a result of using GNU as as a
// gold standard.
emit_REX_OPCODES_MODRM_regG_regE(
sink,
opcode_R,
1,
*regE,
regG.to_reg(),
flags,
);
// NB: if this is ever extended to handle byte size
// ops, be sure to retain redundant REX prefixes.
}
RMI::M { addr } => {
// Whereas here we revert to the "normal" G-E ordering.
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
opcode_M,
1,
regG.to_reg(),
addr,
flags,
);
}
RMI::I { simm32 } => {
let useImm8 = low8willSXto32(*simm32);
let opcode = if useImm8 { 0x83 } else { 0x81 };
// And also here we use the "normal" G-E ordering.
let encG = iregEnc(regG.to_reg());
emit_REX_OPCODES_MODRM_encG_encE(sink, opcode, 1, subopcode_I, encG, flags);
emit_simm(sink, if useImm8 { 1 } else { 4 }, *simm32);
}
}
}
}
Inst::Imm_R {
dst_is_64,
simm64,
dst,
} => {
let encDst = iregEnc(dst.to_reg());
if *dst_is_64 {
// FIXME JRS 2020Feb10: also use the 32-bit case here when
// possible
sink.put1(0x48 | ((encDst >> 3) & 1));
sink.put1(0xB8 | (encDst & 7));
sink.put8(*simm64);
} else {
if ((encDst >> 3) & 1) == 1 {
sink.put1(0x41);
}
sink.put1(0xB8 | (encDst & 7));
sink.put4(*simm64 as u32);
}
}
Inst::Mov_R_R { is_64, src, dst } => {
let flags = if *is_64 { F_NONE } else { F_CLEAR_REX_W };
emit_REX_OPCODES_MODRM_regG_regE(sink, 0x89, 1, *src, dst.to_reg(), flags);
}
Inst::MovZX_M_R { extMode, addr, dst } => {
match extMode {
ExtMode::BL => {
// MOVZBL is (REX.W==0) 0F B6 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB6,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::BQ => {
// MOVZBQ is (REX.W==1) 0F B6 /r
// I'm not sure why the Intel manual offers different
// encodings for MOVZBQ than for MOVZBL. AIUI they should
// achieve the same, since MOVZBL is just going to zero out
// the upper half of the destination anyway.
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB6,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::WL => {
// MOVZWL is (REX.W==0) 0F B7 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB7,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::WQ => {
// MOVZWQ is (REX.W==1) 0F B7 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FB7,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::LQ => {
// This is just a standard 32 bit load, and we rely on the
// default zero-extension rule to perform the extension.
// MOV r/m32, r32 is (REX.W==0) 8B /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x8B,
1,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
}
}
Inst::Mov64_M_R { addr, dst } => {
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, 0x8B, 1, dst.to_reg(), addr, F_NONE)
}
Inst::MovSX_M_R { extMode, addr, dst } => {
match extMode {
ExtMode::BL => {
// MOVSBL is (REX.W==0) 0F BE /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBE,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::BQ => {
// MOVSBQ is (REX.W==1) 0F BE /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBE,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::WL => {
// MOVSWL is (REX.W==0) 0F BF /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBF,
2,
dst.to_reg(),
addr,
F_CLEAR_REX_W,
)
}
ExtMode::WQ => {
// MOVSWQ is (REX.W==1) 0F BF /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x0FBF,
2,
dst.to_reg(),
addr,
F_NONE,
)
}
ExtMode::LQ => {
// MOVSLQ is (REX.W==1) 63 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x63,
1,
dst.to_reg(),
addr,
F_NONE,
)
}
}
}
Inst::Mov_R_M { size, src, addr } => {
match size {
1 => {
// This is one of the few places where the presence of a
// redundant REX prefix changes the meaning of the
// instruction.
let encSrc = iregEnc(*src);
let retainRedundantRex = if encSrc >= 4 && encSrc <= 7 {
F_RETAIN_REDUNDANT_REX
} else {
0
};
// MOV r8, r/m8 is (REX.W==0) 88 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x88,
1,
*src,
addr,
F_CLEAR_REX_W | retainRedundantRex,
)
}
2 => {
// MOV r16, r/m16 is 66 (REX.W==0) 89 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x89,
1,
*src,
addr,
F_CLEAR_REX_W | F_PREFIX_66,
)
}
4 => {
// MOV r32, r/m32 is (REX.W==0) 89 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
sink,
0x89,
1,
*src,
addr,
F_CLEAR_REX_W,
)
}
8 => {
// MOV r64, r/m64 is (REX.W==1) 89 /r
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, 0x89, 1, *src, addr, F_NONE)
}
_ => panic!("x64::Inst::Mov_R_M::emit: unreachable"),
}
}
Inst::Shift_R {
is_64,
kind,
num_bits,
dst,
} => {
let encDst = iregEnc(dst.to_reg());
let subopcode = match kind {
ShiftKind::Left => 4,
ShiftKind::RightZ => 5,
ShiftKind::RightS => 7,
};
match num_bits {
None => {
// SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
// SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xD3,
1,
subopcode,
encDst,
if *is_64 { F_NONE } else { F_CLEAR_REX_W },
);
}
Some(num_bits) => {
// SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
// SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
// When the shift amount is 1, there's an even shorter encoding, but we don't
// bother with that nicety here.
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xC1,
1,
subopcode,
encDst,
if *is_64 { F_NONE } else { F_CLEAR_REX_W },
);
sink.put1(*num_bits);
}
}
}
Inst::Cmp_RMI_R {
size,
src: srcE,
dst: regG,
} => {
let mut retainRedundantRex = 0;
if *size == 1 {
// Here, a redundant REX prefix changes the meaning of the
// instruction.
let encG = iregEnc(*regG);
if encG >= 4 && encG <= 7 {
retainRedundantRex = F_RETAIN_REDUNDANT_REX;
}
}
let mut flags = match size {
8 => F_NONE,
4 => F_CLEAR_REX_W,
2 => F_CLEAR_REX_W | F_PREFIX_66,
1 => F_CLEAR_REX_W | retainRedundantRex,
_ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"),
};
match srcE {
RMI::R { reg: regE } => {
let opcode = if *size == 1 { 0x38 } else { 0x39 };
if *size == 1 {
// We also need to check whether the E register forces
// the use of a redundant REX.
let encE = iregEnc(*regE);
if encE >= 4 && encE <= 7 {
flags |= F_RETAIN_REDUNDANT_REX;
}
}
// Same comment re swapped args as for Alu_RMI_R.
emit_REX_OPCODES_MODRM_regG_regE(sink, opcode, 1, *regE, *regG, flags);
}
RMI::M { addr } => {
let opcode = if *size == 1 { 0x3A } else { 0x3B };
// Whereas here we revert to the "normal" G-E ordering.
emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, opcode, 1, *regG, addr, flags);
}
RMI::I { simm32 } => {
// FIXME JRS 2020Feb11: there are shorter encodings for
// cmp $imm, rax/eax/ax/al.
let useImm8 = low8willSXto32(*simm32);
let opcode = if *size == 1 {
0x80
} else if useImm8 {
0x83
} else {
0x81
};
// And also here we use the "normal" G-E ordering.
let encG = iregEnc(*regG);
emit_REX_OPCODES_MODRM_encG_encE(
sink, opcode, 1, 7, /*subopcode*/
encG, flags,
);
emit_simm(sink, if useImm8 { 1 } else { *size }, *simm32);
}
}
}
Inst::Push64 { src } => {
match src {
RMI::R { reg } => {
let encReg = iregEnc(*reg);
let rex = 0x40 | ((encReg >> 3) & 1);
if rex != 0x40 {
sink.put1(rex);
}
sink.put1(0x50 | (encReg & 7));
}
RMI::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink,
0xFF,
1,
6, /*subopcode*/
addr,
F_CLEAR_REX_W,
);
}
RMI::I { simm32 } => {
if low8willSXto64(*simm32) {
sink.put1(0x6A);
sink.put1(*simm32 as u8);
} else {
sink.put1(0x68);
sink.put4(*simm32);
}
}
}
}
Inst::Pop64 { dst } => {
let encDst = iregEnc(dst.to_reg());
if encDst >= 8 {
// 0x41 == REX.{W=0, B=1}. It seems that REX.W is irrelevant
// here.
sink.put1(0x41);
}
sink.put1(0x58 + (encDst & 7));
}
//
// ** Inst::CallKnown
//
Inst::CallUnknown { dest } => {
match dest {
RM::R { reg } => {
let regEnc = iregEnc(*reg);
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xFF,
1,
2, /*subopcode*/
regEnc,
F_CLEAR_REX_W,
);
}
RM::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink,
0xFF,
1,
2, /*subopcode*/
addr,
F_CLEAR_REX_W,
);
}
}
}
Inst::Ret {} => sink.put1(0xC3),
Inst::JmpKnown {
dest: BranchTarget::Block(..),
} => {
// This case occurs when we are computing block offsets/sizes, prior to lowering
// block-index targets to concrete offsets. Only the size matters, so emit 5 placeholder bytes.
sink.put1(0);
sink.put4(0);
}
Inst::JmpKnown {
dest: BranchTarget::ResolvedOffset(_bix, offset),
} if *offset >= -0x7FFF_FF00 && *offset <= 0x7FFF_FF00 => {
// And now for real
let mut offs_i32 = *offset as i32;
offs_i32 -= 5;
let offs_u32 = offs_i32 as u32;
sink.put1(0xE9);
sink.put4(offs_u32);
}
//
// ** Inst::JmpCondSymm XXXX should never happen
//
Inst::JmpCond {
cc: _,
target: BranchTarget::Block(..),
} => {
// This case occurs when we are computing block offsets / sizes,
// prior to lowering block-index targets to concrete-offset targets.
// Only the size matters, so let's emit 6 bytes, as below.
sink.put1(0);
sink.put1(0);
sink.put4(0);
}
Inst::JmpCond {
cc,
target: BranchTarget::ResolvedOffset(_bix, offset),
} if *offset >= -0x7FFF_FF00 && *offset <= 0x7FFF_FF00 => {
// This insn is 6 bytes long. Currently `offset` is relative to
// the start of this insn, but the Intel encoding requires it to
// be relative to the start of the next instruction. Hence the
// adjustment.
let mut offs_i32 = *offset as i32;
offs_i32 -= 6;
let offs_u32 = offs_i32 as u32;
sink.put1(0x0F);
sink.put1(0x80 + cc.get_enc());
sink.put4(offs_u32);
}
//
// ** Inst::JmpCondCompound XXXX should never happen
//
Inst::JmpUnknown { target } => {
match target {
RM::R { reg } => {
let regEnc = iregEnc(*reg);
emit_REX_OPCODES_MODRM_encG_encE(
sink,
0xFF,
1,
4, /*subopcode*/
regEnc,
F_CLEAR_REX_W,
);
}
RM::M { addr } => {
emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
sink,
0xFF,
1,
4, /*subopcode*/
addr,
F_CLEAR_REX_W,
);
}
}
}
_ => panic!("x64_emit: unhandled: {} ", inst.show_rru(None)),
}
}

File diff suppressed because it is too large.


@@ -0,0 +1,956 @@
//! This module defines x86_64-specific machine instruction types.
#![allow(dead_code)]
#![allow(non_snake_case)]
#![allow(non_camel_case_types)]
use std::fmt;
use std::string::{String, ToString};
use regalloc::RegUsageCollector;
use regalloc::Set;
use regalloc::{RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable};
use crate::binemit::CodeOffset;
use crate::ir::types::{B1, B128, B16, B32, B64, B8, F32, F64, I128, I16, I32, I64, I8};
use crate::ir::ExternalName;
use crate::ir::Type;
use crate::machinst::*;
use crate::{settings, CodegenError, CodegenResult};
pub mod args;
mod emit;
#[cfg(test)]
mod emit_tests;
pub mod regs;
use args::*;
use regs::{create_reg_universe_systemv, show_ireg_sized};
//=============================================================================
// Instructions (top level): definition
// Don't build these directly. Instead use the Inst:: functions to create them.
/// Instructions. Destinations are on the RIGHT (a la AT&T syntax).
#[derive(Clone)]
pub(crate) enum Inst {
/// nops of various sizes, including zero
Nop { len: u8 },
/// (add sub and or xor mul adc? sbb?) (32 64) (reg addr imm) reg
Alu_RMI_R {
is_64: bool,
op: RMI_R_Op,
src: RMI,
dst: Writable<Reg>,
},
/// (imm32 imm64) reg.
/// Either: movl $imm32, %reg32 or movabsq $imm64, %reg64
Imm_R {
dst_is_64: bool,
simm64: u64,
dst: Writable<Reg>,
},
/// mov (64 32) reg reg
Mov_R_R {
is_64: bool,
src: Reg,
dst: Writable<Reg>,
},
/// movz (bl bq wl wq lq) addr reg (good for all ZX loads except 64->64).
/// Note that the lq variant doesn't really exist since the default
/// zero-extend rule makes it unnecessary. For that case we emit the
/// equivalent "movl AM, reg32".
MovZX_M_R {
extMode: ExtMode,
addr: Addr,
dst: Writable<Reg>,
},
/// A plain 64-bit integer load, since MovZX_M_R can't represent that
Mov64_M_R { addr: Addr, dst: Writable<Reg> },
/// movs (bl bq wl wq lq) addr reg (good for all SX loads)
MovSX_M_R {
extMode: ExtMode,
addr: Addr,
dst: Writable<Reg>,
},
/// mov (b w l q) reg addr (good for all integer stores)
Mov_R_M {
size: u8, // 1, 2, 4 or 8
src: Reg,
addr: Addr,
},
/// (shl shr sar) (l q) imm reg
Shift_R {
is_64: bool,
kind: ShiftKind,
/// shift count: Some(0 .. #bits-in-type - 1), or None to mean "%cl".
num_bits: Option<u8>,
dst: Writable<Reg>,
},
/// cmp (b w l q) (reg addr imm) reg
Cmp_RMI_R {
size: u8, // 1, 2, 4 or 8
src: RMI,
dst: Reg,
},
/// pushq (reg addr imm)
Push64 { src: RMI },
/// popq reg
Pop64 { dst: Writable<Reg> },
/// call simm32
CallKnown {
dest: ExternalName,
uses: Set<Reg>,
defs: Set<Writable<Reg>>,
},
/// callq (reg mem)
CallUnknown {
dest: RM,
//uses: Set<Reg>,
//defs: Set<Writable<Reg>>,
},
// ---- branches (exactly one must appear at end of BB) ----
/// ret
Ret,
/// A placeholder instruction, generating no code, meaning that a function epilogue must be
/// inserted there.
EpiloguePlaceholder,
/// jmp simm32
JmpKnown { dest: BranchTarget },
/// jcond cond target target
// Symmetrical two-way conditional branch.
// Should never reach the emitter.
JmpCondSymm {
cc: CC,
taken: BranchTarget,
not_taken: BranchTarget,
},
/// Lowered conditional branch: contains the original instruction, and a
/// flag indicating whether to invert the taken-condition or not. Only one
/// BranchTarget is retained, and the other is implicitly the next
/// instruction, given the final basic-block layout.
JmpCond {
cc: CC,
//inverted: bool, is this needed?
target: BranchTarget,
},
/// As for `JmpCond`, but represents a condbr/uncond-br sequence (two
/// actual machine instructions). Needed when the final block layout implies
/// that neither arm of a conditional branch targets the fallthrough block.
// Should never reach the emitter
JmpCondCompound {
cc: CC,
taken: BranchTarget,
not_taken: BranchTarget,
},
/// jmpq (reg mem)
JmpUnknown { target: RM },
}
// Handy constructors for Insts.
// For various sizes, will some number of lowest bits sign extend to be the
// same as the whole value?
pub(crate) fn low32willSXto64(x: u64) -> bool {
let xs = x as i64;
xs == ((xs << 32) >> 32)
}
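// For example, low32willSXto64(0xFFFF_FFFF_FFFF_FFFF) is true (both sides are -1), while
// low32willSXto64(0x8000_0000) is false: its low 32 bits sign-extend to -2^31, not +2^31.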
impl Inst {
pub(crate) fn nop(len: u8) -> Self {
debug_assert!(len <= 16);
Self::Nop { len }
}
pub(crate) fn alu_rmi_r(is_64: bool, op: RMI_R_Op, src: RMI, dst: Writable<Reg>) -> Self {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Self::Alu_RMI_R {
is_64,
op,
src,
dst,
}
}
pub(crate) fn imm_r(dst_is_64: bool, simm64: u64, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
if !dst_is_64 {
debug_assert!(low32willSXto64(simm64));
}
Inst::Imm_R {
dst_is_64,
simm64,
dst,
}
}
pub(crate) fn mov_r_r(is_64: bool, src: Reg, dst: Writable<Reg>) -> Inst {
debug_assert!(src.get_class() == RegClass::I64);
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::Mov_R_R { is_64, src, dst }
}
pub(crate) fn movzx_m_r(extMode: ExtMode, addr: Addr, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::MovZX_M_R { extMode, addr, dst }
}
pub(crate) fn mov64_m_r(addr: Addr, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::Mov64_M_R { addr, dst }
}
pub(crate) fn movsx_m_r(extMode: ExtMode, addr: Addr, dst: Writable<Reg>) -> Inst {
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::MovSX_M_R { extMode, addr, dst }
}
pub(crate) fn mov_r_m(
size: u8, // 1, 2, 4 or 8
src: Reg,
addr: Addr,
) -> Inst {
debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
debug_assert!(src.get_class() == RegClass::I64);
Inst::Mov_R_M { size, src, addr }
}
pub(crate) fn shift_r(
is_64: bool,
kind: ShiftKind,
num_bits: Option<u8>,
dst: Writable<Reg>,
) -> Inst {
debug_assert!(if let Some(num_bits) = num_bits {
num_bits < if is_64 { 64 } else { 32 }
} else {
true
});
debug_assert!(dst.to_reg().get_class() == RegClass::I64);
Inst::Shift_R {
is_64,
kind,
num_bits,
dst,
}
}
pub(crate) fn cmp_rmi_r(
size: u8, // 1, 2, 4 or 8
src: RMI,
dst: Reg,
) -> Inst {
debug_assert!(size == 8 || size == 4 || size == 2 || size == 1);
debug_assert!(dst.get_class() == RegClass::I64);
Inst::Cmp_RMI_R { size, src, dst }
}
pub(crate) fn push64(src: RMI) -> Inst {
Inst::Push64 { src }
}
pub(crate) fn pop64(dst: Writable<Reg>) -> Inst {
Inst::Pop64 { dst }
}
pub(crate) fn call_unknown(dest: RM) -> Inst {
Inst::CallUnknown { dest }
}
pub(crate) fn ret() -> Inst {
Inst::Ret
}
pub(crate) fn epilogue_placeholder() -> Inst {
Inst::EpiloguePlaceholder
}
pub(crate) fn jmp_known(dest: BranchTarget) -> Inst {
Inst::JmpKnown { dest }
}
pub(crate) fn jmp_cond_symm(cc: CC, taken: BranchTarget, not_taken: BranchTarget) -> Inst {
Inst::JmpCondSymm {
cc,
taken,
not_taken,
}
}
pub(crate) fn jmp_cond(cc: CC, target: BranchTarget) -> Inst {
Inst::JmpCond { cc, target }
}
pub(crate) fn jmp_cond_compound(cc: CC, taken: BranchTarget, not_taken: BranchTarget) -> Inst {
Inst::JmpCondCompound {
cc,
taken,
not_taken,
}
}
pub(crate) fn jmp_unknown(target: RM) -> Inst {
Inst::JmpUnknown { target }
}
}
//=============================================================================
// Instructions: printing
impl ShowWithRRU for Inst {
fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
fn ljustify(s: String) -> String {
let w = 7;
if s.len() >= w {
s
} else {
let need = usize::min(w, w - s.len());
s + &format!("{nil: <width$}", nil = "", width = need)
}
}
fn ljustify2(s1: String, s2: String) -> String {
ljustify(s1 + &s2)
}
fn suffixLQ(is_64: bool) -> String {
(if is_64 { "q" } else { "l" }).to_string()
}
fn sizeLQ(is_64: bool) -> u8 {
if is_64 {
8
} else {
4
}
}
fn suffixBWLQ(size: u8) -> String {
match size {
1 => "b".to_string(),
2 => "w".to_string(),
4 => "l".to_string(),
8 => "q".to_string(),
_ => panic!("Inst(x64).show.suffixBWLQ: size={}", size),
}
}
match self {
Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len),
Inst::Alu_RMI_R {
is_64,
op,
src,
dst,
} => format!(
"{} {}, {}",
ljustify2(op.to_string(), suffixLQ(*is_64)),
src.show_rru_sized(mb_rru, sizeLQ(*is_64)),
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64)),
),
Inst::Imm_R {
dst_is_64,
simm64,
dst,
} => {
if *dst_is_64 {
format!(
"{} ${}, {}",
ljustify("movabsq".to_string()),
*simm64 as i64,
show_ireg_sized(dst.to_reg(), mb_rru, 8)
)
} else {
format!(
"{} ${}, {}",
ljustify("movl".to_string()),
(*simm64 as u32) as i32,
show_ireg_sized(dst.to_reg(), mb_rru, 4)
)
}
}
Inst::Mov_R_R { is_64, src, dst } => format!(
"{} {}, {}",
ljustify2("mov".to_string(), suffixLQ(*is_64)),
show_ireg_sized(*src, mb_rru, sizeLQ(*is_64)),
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64))
),
Inst::MovZX_M_R { extMode, addr, dst } => {
if *extMode == ExtMode::LQ {
format!(
"{} {}, {}",
ljustify("movl".to_string()),
addr.show_rru(mb_rru),
show_ireg_sized(dst.to_reg(), mb_rru, 4)
)
} else {
format!(
"{} {}, {}",
ljustify2("movz".to_string(), extMode.to_string()),
addr.show_rru(mb_rru),
show_ireg_sized(dst.to_reg(), mb_rru, extMode.dst_size())
)
}
}
Inst::Mov64_M_R { addr, dst } => format!(
"{} {}, {}",
ljustify("movq".to_string()),
addr.show_rru(mb_rru),
dst.show_rru(mb_rru)
),
Inst::MovSX_M_R { extMode, addr, dst } => format!(
"{} {}, {}",
ljustify2("movs".to_string(), extMode.to_string()),
addr.show_rru(mb_rru),
show_ireg_sized(dst.to_reg(), mb_rru, extMode.dst_size())
),
Inst::Mov_R_M { size, src, addr } => format!(
"{} {}, {}",
ljustify2("mov".to_string(), suffixBWLQ(*size)),
show_ireg_sized(*src, mb_rru, *size),
addr.show_rru(mb_rru)
),
Inst::Shift_R {
is_64,
kind,
num_bits,
dst,
} => match num_bits {
None => format!(
"{} %cl, {}",
ljustify2(kind.to_string(), suffixLQ(*is_64)),
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64))
),
Some(num_bits) => format!(
"{} ${}, {}",
ljustify2(kind.to_string(), suffixLQ(*is_64)),
num_bits,
show_ireg_sized(dst.to_reg(), mb_rru, sizeLQ(*is_64))
),
},
Inst::Cmp_RMI_R { size, src, dst } => format!(
"{} {}, {}",
ljustify2("cmp".to_string(), suffixBWLQ(*size)),
src.show_rru_sized(mb_rru, *size),
show_ireg_sized(*dst, mb_rru, *size)
),
Inst::Push64 { src } => {
format!("{} {}", ljustify("pushq".to_string()), src.show_rru(mb_rru))
}
Inst::Pop64 { dst } => {
format!("{} {}", ljustify("popq".to_string()), dst.show_rru(mb_rru))
}
//Inst::CallKnown { target } => format!("{} {:?}", ljustify("call".to_string()), target),
Inst::CallKnown { .. } => "**CallKnown**".to_string(),
Inst::CallUnknown { dest } => format!(
"{} *{}",
ljustify("call".to_string()),
dest.show_rru(mb_rru)
),
Inst::Ret => "ret".to_string(),
Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(),
Inst::JmpKnown { dest } => {
format!("{} {}", ljustify("jmp".to_string()), dest.show_rru(mb_rru))
}
Inst::JmpCondSymm {
cc,
taken,
not_taken,
} => format!(
"{} taken={} not_taken={}",
ljustify2("j".to_string(), cc.to_string()),
taken.show_rru(mb_rru),
not_taken.show_rru(mb_rru)
),
//
Inst::JmpCond { cc, ref target } => format!(
"{} {}",
ljustify2("j".to_string(), cc.to_string()),
target.show_rru(None)
),
//
Inst::JmpCondCompound { .. } => "**JmpCondCompound**".to_string(),
Inst::JmpUnknown { target } => format!(
"{} *{}",
ljustify("jmp".to_string()),
target.show_rru(mb_rru)
),
}
}
}
// Temp hook for legacy printing machinery
impl fmt::Debug for Inst {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
// Print the insn without a Universe :-(
write!(fmt, "{}", self.show_rru(None))
}
}
fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
// This is a bit subtle. If some register is in the modified set, then it may not be in either
// the use or def sets. However, enforcing that directly is somewhat difficult. Instead,
// regalloc.rs will "fix" this for us by removing the the modified set from the use and def
// sets.
match inst {
// ** Nop
Inst::Alu_RMI_R {
is_64: _,
op: _,
src,
dst,
} => {
src.get_regs_as_uses(collector);
collector.add_mod(*dst);
}
Inst::Imm_R {
dst_is_64: _,
simm64: _,
dst,
} => {
collector.add_def(*dst);
}
Inst::Mov_R_R { is_64: _, src, dst } => {
collector.add_use(*src);
collector.add_def(*dst);
}
Inst::MovZX_M_R {
extMode: _,
addr,
dst,
} => {
addr.get_regs_as_uses(collector);
collector.add_def(*dst);
}
Inst::Mov64_M_R { addr, dst } => {
addr.get_regs_as_uses(collector);
collector.add_def(*dst);
}
Inst::MovSX_M_R {
extMode: _,
addr,
dst,
} => {
addr.get_regs_as_uses(collector);
collector.add_def(*dst);
}
Inst::Mov_R_M { size: _, src, addr } => {
collector.add_use(*src);
addr.get_regs_as_uses(collector);
}
Inst::Shift_R {
is_64: _,
kind: _,
num_bits,
dst,
} => {
if num_bits.is_none() {
collector.add_use(regs::rcx());
}
collector.add_mod(*dst);
}
Inst::Cmp_RMI_R { size: _, src, dst } => {
src.get_regs_as_uses(collector);
collector.add_use(*dst); // yes, really `add_use`
}
Inst::Push64 { src } => {
src.get_regs_as_uses(collector);
collector.add_mod(Writable::from_reg(regs::rsp()));
}
Inst::Pop64 { dst } => {
collector.add_def(*dst);
}
Inst::CallKnown {
dest: _,
uses: _,
defs: _,
} => {
// FIXME add arg regs (iru.used) and caller-saved regs (iru.defined)
unimplemented!();
}
Inst::CallUnknown { dest } => {
dest.get_regs_as_uses(collector);
}
Inst::Ret => {}
Inst::EpiloguePlaceholder => {}
Inst::JmpKnown { dest: _ } => {}
Inst::JmpCondSymm {
cc: _,
taken: _,
not_taken: _,
} => {}
//
// ** JmpCond
//
// ** JmpCondCompound
//
//Inst::JmpUnknown { target } => {
// target.get_regs_as_uses(collector);
//}
Inst::Nop { .. }
| Inst::JmpCond { .. }
| Inst::JmpCondCompound { .. }
| Inst::JmpUnknown { .. } => unimplemented!("x64_get_regs inst"),
}
}
//=============================================================================
// Instructions and subcomponents: map_regs
fn map_use(m: &RegUsageMapper, r: &mut Reg) {
if r.is_virtual() {
let new = m.get_use(r.to_virtual_reg()).unwrap().to_reg();
*r = new;
}
}
fn map_def(m: &RegUsageMapper, r: &mut Writable<Reg>) {
if r.to_reg().is_virtual() {
let new = m.get_def(r.to_reg().to_virtual_reg()).unwrap().to_reg();
*r = Writable::from_reg(new);
}
}
fn map_mod(m: &RegUsageMapper, r: &mut Writable<Reg>) {
if r.to_reg().is_virtual() {
let new = m.get_mod(r.to_reg().to_virtual_reg()).unwrap().to_reg();
*r = Writable::from_reg(new);
}
}
impl Addr {
fn map_uses(&mut self, map: &RegUsageMapper) {
match self {
Addr::IR {
simm32: _,
ref mut base,
} => map_use(map, base),
Addr::IRRS {
simm32: _,
ref mut base,
ref mut index,
shift: _,
} => {
map_use(map, base);
map_use(map, index);
}
}
}
}
impl RMI {
fn map_uses(&mut self, map: &RegUsageMapper) {
match self {
RMI::R { ref mut reg } => map_use(map, reg),
RMI::M { ref mut addr } => addr.map_uses(map),
RMI::I { simm32: _ } => {}
}
}
}
impl RM {
fn map_uses(&mut self, map: &RegUsageMapper) {
match self {
RM::R { ref mut reg } => map_use(map, reg),
RM::M { ref mut addr } => addr.map_uses(map),
}
}
}
fn x64_map_regs(inst: &mut Inst, mapper: &RegUsageMapper) {
// Note this must be carefully synchronized with x64_get_regs.
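// In particular, every register recorded there as a use, def or mod must be remapped here
// with the corresponding map_use / map_def / map_mod helper.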
match inst {
// ** Nop
Inst::Alu_RMI_R {
is_64: _,
op: _,
ref mut src,
ref mut dst,
} => {
src.map_uses(mapper);
map_mod(mapper, dst);
}
Inst::Imm_R {
dst_is_64: _,
simm64: _,
ref mut dst,
} => map_def(mapper, dst),
Inst::Mov_R_R {
is_64: _,
ref mut src,
ref mut dst,
} => {
map_use(mapper, src);
map_def(mapper, dst);
}
Inst::MovZX_M_R {
extMode: _,
ref mut addr,
ref mut dst,
} => {
addr.map_uses(mapper);
map_def(mapper, dst);
}
Inst::Mov64_M_R { addr, dst } => {
addr.map_uses(mapper);
map_def(mapper, dst);
}
Inst::MovSX_M_R {
extMode: _,
ref mut addr,
ref mut dst,
} => {
addr.map_uses(mapper);
map_def(mapper, dst);
}
Inst::Mov_R_M {
size: _,
ref mut src,
ref mut addr,
} => {
map_use(mapper, src);
addr.map_uses(mapper);
}
Inst::Shift_R {
is_64: _,
kind: _,
num_bits: _,
ref mut dst,
} => {
map_mod(mapper, dst);
}
Inst::Cmp_RMI_R {
size: _,
ref mut src,
ref mut dst,
} => {
src.map_uses(mapper);
map_use(mapper, dst);
}
Inst::Push64 { ref mut src } => src.map_uses(mapper),
Inst::Pop64 { ref mut dst } => {
map_def(mapper, dst);
}
Inst::CallKnown {
dest: _,
uses: _,
defs: _,
} => {}
Inst::CallUnknown { dest } => dest.map_uses(mapper),
Inst::Ret => {}
Inst::EpiloguePlaceholder => {}
Inst::JmpKnown { dest: _ } => {}
Inst::JmpCondSymm {
cc: _,
taken: _,
not_taken: _,
} => {}
//
// ** JmpCond
//
// ** JmpCondCompound
//
//Inst::JmpUnknown { target } => {
// target.apply_map(mapper);
//}
Inst::Nop { .. }
| Inst::JmpCond { .. }
| Inst::JmpCondCompound { .. }
| Inst::JmpUnknown { .. } => unimplemented!("x64_map_regs opcode"),
}
}
//=============================================================================
// Instructions: misc functions and external interface
impl MachInst for Inst {
fn get_regs(&self, collector: &mut RegUsageCollector) {
x64_get_regs(&self, collector)
}
fn map_regs(&mut self, mapper: &RegUsageMapper) {
x64_map_regs(self, mapper);
}
fn is_move(&self) -> Option<(Writable<Reg>, Reg)> {
// Note (carefully!) that a 32-bit mov *isn't* a no-op since it zeroes
// out the upper 32 bits of the destination. For example, we could
// conceivably use `movl %reg, %reg` to zero out the top 32 bits of
// %reg.
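// Hence only the 64-bit form of `Mov_R_R` is treated as a plain register-to-register move.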
match self {
Self::Mov_R_R { is_64, src, dst } if *is_64 => Some((*dst, *src)),
_ => None,
}
}
fn is_epilogue_placeholder(&self) -> bool {
if let Self::EpiloguePlaceholder = self {
true
} else {
false
}
}
fn is_term<'a>(&'a self) -> MachTerminator<'a> {
match self {
// Interesting cases.
&Self::Ret | &Self::EpiloguePlaceholder => MachTerminator::Ret,
&Self::JmpKnown { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()),
&Self::JmpCondSymm {
cc: _,
taken,
not_taken,
} => MachTerminator::Cond(
taken.as_block_index().unwrap(),
not_taken.as_block_index().unwrap(),
),
&Self::JmpCond { .. } | &Self::JmpCondCompound { .. } => {
panic!("is_term() called after lowering branches");
}
// All other cases are boring.
_ => MachTerminator::None,
}
}
fn gen_move(dst_reg: Writable<Reg>, src_reg: Reg, _ty: Type) -> Inst {
let rc_dst = dst_reg.to_reg().get_class();
let rc_src = src_reg.get_class();
// If this isn't true, we have gone way off the rails.
debug_assert!(rc_dst == rc_src);
match rc_dst {
RegClass::I64 => Inst::mov_r_r(true, src_reg, dst_reg),
_ => panic!("gen_move(x64): unhandled regclass"),
}
}
fn gen_zero_len_nop() -> Inst {
unimplemented!()
}
fn gen_nop(_preferred_size: usize) -> Inst {
unimplemented!()
}
fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option<Inst> {
None
}
fn rc_for_type(ty: Type) -> CodegenResult<RegClass> {
match ty {
I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => Ok(RegClass::I64),
F32 | F64 | I128 | B128 => Ok(RegClass::V128),
_ => Err(CodegenError::Unsupported(format!(
"Unexpected SSA-value type: {}",
ty
))),
}
}
fn gen_jump(blockindex: BlockIndex) -> Inst {
Inst::jmp_known(BranchTarget::Block(blockindex))
}
fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) {
// This is identical (modulo renaming) to the arm64 version.
match self {
&mut Inst::JmpKnown { ref mut dest } => {
dest.map(block_target_map);
}
&mut Inst::JmpCondSymm {
cc: _,
ref mut taken,
ref mut not_taken,
} => {
taken.map(block_target_map);
not_taken.map(block_target_map);
}
&mut Inst::JmpCond { .. } | &mut Inst::JmpCondCompound { .. } => {
panic!("with_block_rewrites called after branch lowering!");
}
_ => {}
}
}
fn with_fallthrough_block(&mut self, fallthrough: Option<BlockIndex>) {
// This is identical (modulo renaming) to the arm64 version.
match self {
&mut Inst::JmpCondSymm {
cc,
taken,
not_taken,
} => {
if taken.as_block_index() == fallthrough {
*self = Inst::jmp_cond(cc.invert(), not_taken);
} else if not_taken.as_block_index() == fallthrough {
*self = Inst::jmp_cond(cc, taken);
} else {
// We need a compound sequence (condbr / uncond-br).
*self = Inst::jmp_cond_compound(cc, taken, not_taken);
}
}
&mut Inst::JmpKnown { dest } => {
if dest.as_block_index() == fallthrough {
*self = Inst::nop(0);
}
}
_ => {}
}
}
fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) {
// This is identical (modulo renaming) to the arm64 version.
match self {
&mut Self::JmpCond {
cc: _,
ref mut target,
} => {
target.lower(targets, my_offset);
}
&mut Self::JmpCondCompound {
cc: _,
ref mut taken,
ref mut not_taken,
..
} => {
taken.lower(targets, my_offset);
not_taken.lower(targets, my_offset);
}
&mut Self::JmpKnown { ref mut dest } => {
dest.lower(targets, my_offset);
}
_ => {}
}
}
fn reg_universe(flags: &settings::Flags) -> RealRegUniverse {
create_reg_universe_systemv(flags)
}
}
impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
fn emit(&self, sink: &mut O, _flags: &settings::Flags) {
emit::emit(self, sink);
}
}
View File
@@ -0,0 +1,261 @@
//! Registers, the Universe thereof, and printing.
//!
//! These are ordered by sequence number, as required in the Universe. The strange ordering is
//! intended to make callee-save registers available before caller-saved ones. This is a net win
//! provided that each function makes at least one onward call. It'll be a net loss for leaf
//! functions, and we should change the ordering in that case, so as to make caller-save regs
//! available first.
//!
//! TODO Maybe have two different universes, one for leaf functions and one for non-leaf functions?
//! Also, they will have to be ABI dependent. Need to find a way to avoid constructing a universe
//! for each function we compile.
use alloc::vec::Vec;
use std::string::String;
use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, NUM_REG_CLASSES};
use crate::machinst::pretty_print::ShowWithRRU;
use crate::settings;
// Hardware encodings for a few registers.
pub const ENC_RBX: u8 = 3;
pub const ENC_RSP: u8 = 4;
pub const ENC_RBP: u8 = 5;
pub const ENC_R12: u8 = 12;
pub const ENC_R13: u8 = 13;
pub const ENC_R14: u8 = 14;
pub const ENC_R15: u8 = 15;
fn gpr(enc: u8, index: u8) -> Reg {
Reg::new_real(RegClass::I64, enc, index)
}
pub(crate) fn r12() -> Reg {
gpr(ENC_R12, 0)
}
pub(crate) fn r13() -> Reg {
gpr(ENC_R13, 1)
}
pub(crate) fn r14() -> Reg {
gpr(ENC_R14, 2)
}
pub(crate) fn r15() -> Reg {
gpr(ENC_R15, 3)
}
pub(crate) fn rbx() -> Reg {
gpr(ENC_RBX, 4)
}
pub(crate) fn rsi() -> Reg {
gpr(6, 5)
}
pub(crate) fn rdi() -> Reg {
gpr(7, 6)
}
pub(crate) fn rax() -> Reg {
gpr(0, 7)
}
pub(crate) fn rcx() -> Reg {
gpr(1, 8)
}
pub(crate) fn rdx() -> Reg {
gpr(2, 9)
}
pub(crate) fn r8() -> Reg {
gpr(8, 10)
}
pub(crate) fn r9() -> Reg {
gpr(9, 11)
}
pub(crate) fn r10() -> Reg {
gpr(10, 12)
}
pub(crate) fn r11() -> Reg {
gpr(11, 13)
}
fn fpr(enc: u8, index: u8) -> Reg {
Reg::new_real(RegClass::V128, enc, index)
}
fn xmm0() -> Reg {
fpr(0, 14)
}
fn xmm1() -> Reg {
fpr(1, 15)
}
fn xmm2() -> Reg {
fpr(2, 16)
}
fn xmm3() -> Reg {
fpr(3, 17)
}
fn xmm4() -> Reg {
fpr(4, 18)
}
fn xmm5() -> Reg {
fpr(5, 19)
}
fn xmm6() -> Reg {
fpr(6, 20)
}
fn xmm7() -> Reg {
fpr(7, 21)
}
fn xmm8() -> Reg {
fpr(8, 22)
}
fn xmm9() -> Reg {
fpr(9, 23)
}
fn xmm10() -> Reg {
fpr(10, 24)
}
fn xmm11() -> Reg {
fpr(11, 25)
}
fn xmm12() -> Reg {
fpr(12, 26)
}
fn xmm13() -> Reg {
fpr(13, 27)
}
fn xmm14() -> Reg {
fpr(14, 28)
}
fn xmm15() -> Reg {
fpr(15, 29)
}
pub(crate) fn rsp() -> Reg {
gpr(ENC_RSP, 30)
}
pub(crate) fn rbp() -> Reg {
gpr(ENC_RBP, 31)
}
/// Create the register universe for X64.
///
/// The ordering of registers matters, as noted in the file-level doc comment. For now this
/// assumes the SystemV calling convention.
pub(crate) fn create_reg_universe_systemv(_flags: &settings::Flags) -> RealRegUniverse {
let mut regs = Vec::<(RealReg, String)>::new();
let mut allocable_by_class = [None; NUM_REG_CLASSES];
// Integer regs.
let mut base = regs.len();
// Callee-saved, in the SystemV x86_64 ABI.
regs.push((r12().to_real_reg(), "%r12".into()));
regs.push((r13().to_real_reg(), "%r13".into()));
regs.push((r14().to_real_reg(), "%r14".into()));
regs.push((r15().to_real_reg(), "%r15".into()));
regs.push((rbx().to_real_reg(), "%rbx".into()));
// Caller-saved, in the SystemV x86_64 ABI.
regs.push((rsi().to_real_reg(), "%rsi".into()));
regs.push((rdi().to_real_reg(), "%rdi".into()));
regs.push((rax().to_real_reg(), "%rax".into()));
regs.push((rcx().to_real_reg(), "%rcx".into()));
regs.push((rdx().to_real_reg(), "%rdx".into()));
regs.push((r8().to_real_reg(), "%r8".into()));
regs.push((r9().to_real_reg(), "%r9".into()));
regs.push((r10().to_real_reg(), "%r10".into()));
regs.push((r11().to_real_reg(), "%r11".into()));
allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
first: base,
last: regs.len() - 1,
suggested_scratch: Some(r12().get_index()),
});
// XMM registers
base = regs.len();
regs.push((xmm0().to_real_reg(), "%xmm0".into()));
regs.push((xmm1().to_real_reg(), "%xmm1".into()));
regs.push((xmm2().to_real_reg(), "%xmm2".into()));
regs.push((xmm3().to_real_reg(), "%xmm3".into()));
regs.push((xmm4().to_real_reg(), "%xmm4".into()));
regs.push((xmm5().to_real_reg(), "%xmm5".into()));
regs.push((xmm6().to_real_reg(), "%xmm6".into()));
regs.push((xmm7().to_real_reg(), "%xmm7".into()));
regs.push((xmm8().to_real_reg(), "%xmm8".into()));
regs.push((xmm9().to_real_reg(), "%xmm9".into()));
regs.push((xmm10().to_real_reg(), "%xmm10".into()));
regs.push((xmm11().to_real_reg(), "%xmm11".into()));
regs.push((xmm12().to_real_reg(), "%xmm12".into()));
regs.push((xmm13().to_real_reg(), "%xmm13".into()));
regs.push((xmm14().to_real_reg(), "%xmm14".into()));
regs.push((xmm15().to_real_reg(), "%xmm15".into()));
allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo {
first: base,
last: regs.len() - 1,
suggested_scratch: Some(xmm15().get_index()),
});
// Other regs, not available to the allocator.
let allocable = regs.len();
regs.push((rsp().to_real_reg(), "%rsp".into()));
regs.push((rbp().to_real_reg(), "%rbp".into()));
RealRegUniverse {
regs,
allocable,
allocable_by_class,
}
}
/// If `reg` denotes an I64-classed reg, make a best-effort attempt to show its name at some
/// smaller size (4, 2 or 1 bytes).
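/// For example, given a register universe, `%rax` is rendered as `%eax`, `%ax` or `%al` for
/// sizes 4, 2 and 1; virtual I64 regs instead get an "l", "w" or "b" suffix.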
pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: u8) -> String {
let mut s = reg.show_rru(mb_rru);
if reg.get_class() != RegClass::I64 || size == 8 {
// We can't do any better.
return s;
}
if reg.is_real() {
// Change (eg) "rax" into "eax", "ax" or "al" as appropriate. This is something one could
// describe diplomatically as "a kludge", but it's only debug code.
let remapper = match s.as_str() {
"%rax" => Some(["%eax", "%ax", "%al"]),
"%rbx" => Some(["%ebx", "%bx", "%bl"]),
"%rcx" => Some(["%ecx", "%cx", "%cl"]),
"%rdx" => Some(["%edx", "%dx", "%dl"]),
"%rsi" => Some(["%esi", "%si", "%sil"]),
"%rdi" => Some(["%edi", "%di", "%dil"]),
"%rbp" => Some(["%ebp", "%bp", "%bpl"]),
"%rsp" => Some(["%esp", "%sp", "%spl"]),
"%r8" => Some(["%r8d", "%r8w", "%r8b"]),
"%r9" => Some(["%r9d", "%r9w", "%r9b"]),
"%r10" => Some(["%r10d", "%r10w", "%r10b"]),
"%r11" => Some(["%r11d", "%r11w", "%r11b"]),
"%r12" => Some(["%r12d", "%r12w", "%r12b"]),
"%r13" => Some(["%r13d", "%r13w", "%r13b"]),
"%r14" => Some(["%r14d", "%r14w", "%r14b"]),
"%r15" => Some(["%r15d", "%r15w", "%r15b"]),
_ => None,
};
if let Some(smaller_names) = remapper {
match size {
4 => s = smaller_names[0].into(),
2 => s = smaller_names[1].into(),
1 => s = smaller_names[2].into(),
_ => panic!("show_ireg_sized: real"),
}
}
} else {
// Add an "l", "w" or "b" suffix to RegClass::I64 vregs used at narrower widths.
let suffix = match size {
4 => "l",
2 => "w",
1 => "b",
_ => panic!("show_ireg_sized: virtual"),
};
s = s + suffix;
}
s
}
View File
@@ -0,0 +1,358 @@
//! Lowering rules for X64.
#![allow(dead_code)]
#![allow(non_snake_case)]
use regalloc::{Reg, Writable};
use crate::ir::condcodes::IntCC;
use crate::ir::types;
use crate::ir::Inst as IRInst;
use crate::ir::{InstructionData, Opcode, Type};
use crate::machinst::lower::*;
use crate::machinst::*;
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
use crate::isa::x64::X64Backend;
/// Context passed to all lowering functions.
type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
//=============================================================================
// Helpers for instruction lowering.
fn is_int_ty(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 | types::I64 => true,
_ => false,
}
}
fn int_ty_to_is64(ty: Type) -> bool {
match ty {
types::I8 | types::I16 | types::I32 => false,
types::I64 => true,
_ => panic!("type {} is none of I8, I16, I32 or I64", ty),
}
}
fn int_ty_to_sizeB(ty: Type) -> u8 {
match ty {
types::I8 => 1,
types::I16 => 2,
types::I32 => 4,
types::I64 => 8,
_ => panic!("int_ty_to_sizeB"),
}
}
fn iri_to_u64_immediate<'a>(ctx: Ctx<'a>, iri: IRInst) -> Option<u64> {
let inst_data = ctx.data(iri);
if inst_data.opcode() == Opcode::Null {
Some(0)
} else {
match inst_data {
&InstructionData::UnaryImm { opcode: _, imm } => {
// Only has Into for i64; we use u64 elsewhere, so we cast.
let imm: i64 = imm.into();
Some(imm as u64)
}
_ => None,
}
}
}
fn inst_condcode(data: &InstructionData) -> IntCC {
match data {
&InstructionData::IntCond { cond, .. }
| &InstructionData::BranchIcmp { cond, .. }
| &InstructionData::IntCompare { cond, .. }
| &InstructionData::IntCondTrap { cond, .. }
| &InstructionData::BranchInt { cond, .. }
| &InstructionData::IntSelect { cond, .. }
| &InstructionData::IntCompareImm { cond, .. } => cond,
_ => panic!("inst_condcode(x64): unhandled: {:?}", data),
}
}
fn intCC_to_x64_CC(cc: IntCC) -> CC {
match cc {
IntCC::Equal => CC::Z,
IntCC::NotEqual => CC::NZ,
IntCC::SignedGreaterThanOrEqual => CC::NL,
IntCC::SignedGreaterThan => CC::NLE,
IntCC::SignedLessThanOrEqual => CC::LE,
IntCC::SignedLessThan => CC::L,
IntCC::UnsignedGreaterThanOrEqual => CC::NB,
IntCC::UnsignedGreaterThan => CC::NBE,
IntCC::UnsignedLessThanOrEqual => CC::BE,
IntCC::UnsignedLessThan => CC::B,
IntCC::Overflow => CC::O,
IntCC::NotOverflow => CC::NO,
}
}
//=============================================================================
// Top-level instruction lowering entry point, for one instruction.
/// Actually codegen an instruction's results into registers.
fn lower_insn_to_regs<'a>(ctx: Ctx<'a>, iri: IRInst) {
let op = ctx.data(iri).opcode();
let ty = if ctx.num_outputs(iri) == 1 {
Some(ctx.output_ty(iri, 0))
} else {
None
};
// This is all outstandingly feeble. TODO: much better!
match op {
Opcode::Iconst => {
if let Some(w64) = iri_to_u64_immediate(ctx, iri) {
// Get exactly the bit pattern in 'w64' into the dest. No
// monkeying with sign extension etc.
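// A value that fits in 32 bits can use the 32-bit form, since writing a 32-bit register
// implicitly zeroes the upper 32 bits of the full 64-bit register.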
let dstIs64 = w64 > 0xFFFF_FFFF;
let regD = ctx.output(iri, 0);
ctx.emit(Inst::imm_r(dstIs64, w64, regD));
} else {
unimplemented!();
}
}
Opcode::Iadd | Opcode::Isub => {
let regD = ctx.output(iri, 0);
let regL = ctx.input(iri, 0);
let regR = ctx.input(iri, 1);
let is64 = int_ty_to_is64(ty.unwrap());
let how = if op == Opcode::Iadd {
RMI_R_Op::Add
} else {
RMI_R_Op::Sub
};
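// x86 ALU instructions are two-operand (dst := dst op src), so copy the left operand into
// the destination first and then combine it with the right operand.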
ctx.emit(Inst::mov_r_r(true, regL, regD));
ctx.emit(Inst::alu_rmi_r(is64, how, RMI::reg(regR), regD));
}
Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
// TODO: implement imm shift value into insn
let tySL = ctx.input_ty(iri, 0);
let tyD = ctx.output_ty(iri, 0); // should be the same as tySL
let regSL = ctx.input(iri, 0);
let regSR = ctx.input(iri, 1);
let regD = ctx.output(iri, 0);
if tyD == tySL && (tyD == types::I32 || tyD == types::I64) {
let how = match op {
Opcode::Ishl => ShiftKind::Left,
Opcode::Ushr => ShiftKind::RightZ,
Opcode::Sshr => ShiftKind::RightS,
_ => unreachable!(),
};
let is64 = tyD == types::I64;
let r_rcx = regs::rcx();
let w_rcx = Writable::<Reg>::from_reg(r_rcx);
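// A variable shift count must live in %cl, so move it into %rcx and select the %cl-based
// form of the shift by passing `None` for the immediate count.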
ctx.emit(Inst::mov_r_r(true, regSL, regD));
ctx.emit(Inst::mov_r_r(true, regSR, w_rcx));
ctx.emit(Inst::shift_r(is64, how, None /*%cl*/, regD));
} else {
unimplemented!()
}
}
Opcode::Uextend | Opcode::Sextend => {
// TODO: this is all extremely lame, all because Mov{ZX,SX}_M_R
// don't accept a register source operand. They should be changed
// so as to have _RM_R form.
// TODO2: if the source operand is a load, incorporate that.
let isZX = op == Opcode::Uextend;
let tyS = ctx.input_ty(iri, 0);
let tyD = ctx.output_ty(iri, 0);
let regS = ctx.input(iri, 0);
let regD = ctx.output(iri, 0);
ctx.emit(Inst::mov_r_r(true, regS, regD));
match (tyS, tyD, isZX) {
(types::I8, types::I64, false) => {
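// Sign-extend the low byte by shifting it up to the top byte and then arithmetic-shifting
// it back down, replicating the sign bit through the upper 56 bits.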
ctx.emit(Inst::shift_r(true, ShiftKind::Left, Some(56), regD));
ctx.emit(Inst::shift_r(true, ShiftKind::RightS, Some(56), regD));
}
_ => unimplemented!(),
}
}
Opcode::FallthroughReturn | Opcode::Return => {
for i in 0..ctx.num_inputs(iri) {
let src_reg = ctx.input(iri, i);
let retval_reg = ctx.retval(i);
ctx.emit(Inst::mov_r_r(true, src_reg, retval_reg));
}
// N.B.: the Ret itself is generated by the ABI.
}
Opcode::IaddImm
| Opcode::ImulImm
| Opcode::UdivImm
| Opcode::SdivImm
| Opcode::UremImm
| Opcode::SremImm
| Opcode::IrsubImm
| Opcode::IaddCin
| Opcode::IaddIfcin
| Opcode::IaddCout
| Opcode::IaddIfcout
| Opcode::IaddCarry
| Opcode::IaddIfcarry
| Opcode::IsubBin
| Opcode::IsubIfbin
| Opcode::IsubBout
| Opcode::IsubIfbout
| Opcode::IsubBorrow
| Opcode::IsubIfborrow
| Opcode::BandImm
| Opcode::BorImm
| Opcode::BxorImm
| Opcode::RotlImm
| Opcode::RotrImm
| Opcode::IshlImm
| Opcode::UshrImm
| Opcode::SshrImm => {
panic!("ALU+imm and ALU+carry ops should not appear here!");
}
Opcode::X86Udivmodx
| Opcode::X86Sdivmodx
| Opcode::X86Umulx
| Opcode::X86Smulx
| Opcode::X86Cvtt2si
| Opcode::X86Fmin
| Opcode::X86Fmax
| Opcode::X86Push
| Opcode::X86Pop
| Opcode::X86Bsr
| Opcode::X86Bsf
| Opcode::X86Pshufd
| Opcode::X86Pshufb
| Opcode::X86Pextr
| Opcode::X86Pinsr
| Opcode::X86Insertps
| Opcode::X86Movsd
| Opcode::X86Movlhps
| Opcode::X86Psll
| Opcode::X86Psrl
| Opcode::X86Psra
| Opcode::X86Ptest
| Opcode::X86Pmaxs
| Opcode::X86Pmaxu
| Opcode::X86Pmins
| Opcode::X86Pminu => {
panic!("x86-specific opcode in supposedly arch-neutral IR!");
}
_ => unimplemented!("unimplemented lowering for opcode {:?}", op),
}
}
//=============================================================================
// Lowering-backend trait implementation.
impl LowerBackend for X64Backend {
type MInst = Inst;
fn lower<C: LowerCtx<I = Inst>>(&self, ctx: &mut C, ir_inst: IRInst) {
lower_insn_to_regs(ctx, ir_inst);
}
fn lower_branch_group<C: LowerCtx<I = Inst>>(
&self,
ctx: &mut C,
branches: &[IRInst],
targets: &[BlockIndex],
fallthrough: Option<BlockIndex>,
) {
// A block should end with at most two branches. The first may be a
// conditional branch; a conditional branch can be followed only by an
// unconditional branch or fallthrough. Otherwise, if only one branch,
// it may be an unconditional branch, a fallthrough, a return, or a
// trap. These conditions are verified by `is_ebb_basic()` during the
// verifier pass.
assert!(branches.len() <= 2);
let mut unimplemented = false;
if branches.len() == 2 {
// Must be a conditional branch followed by an unconditional branch.
let op0 = ctx.data(branches[0]).opcode();
let op1 = ctx.data(branches[1]).opcode();
println!(
"QQQQ lowering two-branch group: opcodes are {:?} and {:?}",
op0, op1
);
assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough);
let taken = BranchTarget::Block(targets[0]);
let not_taken = match op1 {
Opcode::Jump => BranchTarget::Block(targets[1]),
Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()),
_ => unreachable!(), // assert above.
};
match op0 {
Opcode::Brz | Opcode::Brnz => {
let tyS = ctx.input_ty(branches[0], 0);
if is_int_ty(tyS) {
let rS = ctx.input(branches[0], 0);
let cc = match op0 {
Opcode::Brz => CC::Z,
Opcode::Brnz => CC::NZ,
_ => unreachable!(),
};
let sizeB = int_ty_to_sizeB(tyS);
ctx.emit(Inst::cmp_rmi_r(sizeB, RMI::imm(0), rS));
ctx.emit(Inst::jmp_cond_symm(cc, taken, not_taken));
} else {
unimplemented = true;
}
}
Opcode::BrIcmp => {
let tyS = ctx.input_ty(branches[0], 0);
if is_int_ty(tyS) {
let rSL = ctx.input(branches[0], 0);
let rSR = ctx.input(branches[0], 1);
let cc = intCC_to_x64_CC(inst_condcode(ctx.data(branches[0])));
let sizeB = int_ty_to_sizeB(tyS);
// FIXME verify rSR vs rSL ordering
ctx.emit(Inst::cmp_rmi_r(sizeB, RMI::reg(rSR), rSL));
ctx.emit(Inst::jmp_cond_symm(cc, taken, not_taken));
} else {
unimplemented = true;
}
}
// TODO: Brif/icmp, Brff/icmp, jump tables
_ => {
unimplemented = true;
}
}
} else {
assert!(branches.len() == 1);
// Must be an unconditional branch or trap.
let op = ctx.data(branches[0]).opcode();
match op {
Opcode::Jump => {
ctx.emit(Inst::jmp_known(BranchTarget::Block(targets[0])));
}
Opcode::Fallthrough => {
ctx.emit(Inst::jmp_known(BranchTarget::Block(targets[0])));
}
Opcode::Trap => {
unimplemented = true;
}
_ => panic!("Unknown branch type!"),
}
}
if unimplemented {
unimplemented!("lower_branch_group(x64): can't handle: {:?}", branches);
}
}
}
View File
@@ -0,0 +1,92 @@
//! X86_64 Instruction Set Architecture.
use alloc::boxed::Box;
use regalloc::RealRegUniverse;
use target_lexicon::Triple;
use crate::ir::Function;
use crate::isa::Builder as IsaBuilder;
use crate::machinst::pretty_print::ShowWithRRU;
use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
use crate::result::CodegenResult;
use crate::settings::{self, Flags};
use crate::isa::x64::inst::regs::create_reg_universe_systemv;
mod abi;
mod inst;
mod lower;
/// An X64 backend.
pub(crate) struct X64Backend {
triple: Triple,
flags: Flags,
}
impl X64Backend {
/// Create a new X64 backend with the given (shared) flags.
fn new_with_flags(triple: Triple, flags: Flags) -> Self {
Self { triple, flags }
}
fn compile_vcode(&self, func: &Function, flags: Flags) -> CodegenResult<VCode<inst::Inst>> {
// This performs lowering to VCode, register-allocates the code, computes
// block layout and finalizes branches. The result is ready for binary emission.
let abi = Box::new(abi::X64ABIBody::new(&func, flags));
compile::compile::<Self>(&func, self, abi)
}
}
impl MachBackend for X64Backend {
fn compile_function(
&self,
func: &Function,
want_disasm: bool,
) -> CodegenResult<MachCompileResult> {
let flags = self.flags();
let vcode = self.compile_vcode(func, flags.clone())?;
let sections = vcode.emit();
let frame_size = vcode.frame_size();
let disasm = if want_disasm {
Some(vcode.show_rru(Some(&create_reg_universe_systemv(flags))))
} else {
None
};
Ok(MachCompileResult {
sections,
frame_size,
disasm,
})
}
fn flags(&self) -> &Flags {
&self.flags
}
fn name(&self) -> &'static str {
"x64"
}
fn triple(&self) -> Triple {
self.triple.clone()
}
fn reg_universe(&self) -> RealRegUniverse {
create_reg_universe_systemv(&self.flags)
}
}
/// Create a new `isa::Builder`.
pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder {
IsaBuilder {
triple,
setup: settings::builder(),
constructor: |triple: Triple, flags: Flags, _arch_flag_builder: settings::Builder| {
let backend = X64Backend::new_with_flags(triple, flags);
Box::new(TargetIsaAdapter::new(backend))
},
}
}
View File
@@ -53,12 +53,23 @@ fn isa_constructor(
        PointerWidth::U32 => &enc_tables::LEVEL1_I32[..],
        PointerWidth::U64 => &enc_tables::LEVEL1_I64[..],
    };
-    Box::new(Isa {
-        triple,
-        isa_flags: settings::Flags::new(&shared_flags, builder),
-        shared_flags,
-        cpumode: level1,
-    })
+    let isa_flags = settings::Flags::new(&shared_flags, builder);
+
+    if isa_flags.use_new_backend() {
+        #[cfg(not(feature = "x64"))]
+        panic!("new backend x86 support not included by cargo features!");
+
+        #[cfg(feature = "x64")]
+        super::x64::isa_builder(triple).finish(shared_flags)
+    } else {
+        Box::new(Isa {
+            triple,
+            isa_flags,
+            shared_flags,
+            cpumode: level1,
+        })
+    }
}

impl TargetIsa for Isa {