Add a work-in-progress backend for x86_64 using the new instruction selection;

Most of the work is credited to Julian Seward. Co-authored-by: Julian Seward <jseward@acm.org> Co-authored-by: Chris Fallin <cfallin@mozilla.com>
2020-04-27 16:19:08 +02:00
parent 6bee767129
commit fa54422854
12 changed files with 5690 additions and 6 deletions
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -0,0 +1,888 @@
+use regalloc::{Reg, RegClass};
+
+use crate::isa::x64::inst::*;
+
+fn low8willSXto64(x: u32) -> bool {
+    let xs = (x as i32) as i64;
+    xs == ((xs << 56) >> 56)
+}
+
+fn low8willSXto32(x: u32) -> bool {
+    let xs = x as i32;
+    xs == ((xs << 24) >> 24)
+}
+
+//=============================================================================
+// Instructions and subcomponents: emission
+
+// For all of the routines that take both a memory-or-reg operand (sometimes
+// called "E" in the Intel documentation) and a reg-only operand ("G" in
+// Intelese), the order is always G first, then E.
+//
+// "enc" in the following means "hardware register encoding number".
+
+#[inline(always)]
+fn mkModRegRM(m0d: u8, encRegG: u8, rmE: u8) -> u8 {
+    debug_assert!(m0d < 4);
+    debug_assert!(encRegG < 8);
+    debug_assert!(rmE < 8);
+    ((m0d & 3) << 6) | ((encRegG & 7) << 3) | (rmE & 7)
+}
+
+#[inline(always)]
+fn mkSIB(shift: u8, encIndex: u8, encBase: u8) -> u8 {
+    debug_assert!(shift < 4);
+    debug_assert!(encIndex < 8);
+    debug_assert!(encBase < 8);
+    ((shift & 3) << 6) | ((encIndex & 7) << 3) | (encBase & 7)
+}
+
+/// Get the encoding number from something which we sincerely hope is a real
+/// register of class I64.
+#[inline(always)]
+fn iregEnc(reg: Reg) -> u8 {
+    debug_assert!(reg.is_real());
+    debug_assert!(reg.get_class() == RegClass::I64);
+    reg.get_hw_encoding()
+}
+
+// F_*: these flags describe special handling of the insn to be generated.  Be
+// careful with these.  It is easy to create nonsensical combinations.
+const F_NONE: u32 = 0;
+
+/// Emit the REX prefix byte even if it appears to be redundant (== 0x40).
+const F_RETAIN_REDUNDANT_REX: u32 = 1;
+
+/// Set the W bit in the REX prefix to zero.  By default it will be set to 1,
+/// indicating a 64-bit operation.
+const F_CLEAR_REX_W: u32 = 2;
+
+/// Add an 0x66 (operand-size override) prefix.  This is necessary to indicate
+/// a 16-bit operation.  Normally this will be used together with F_CLEAR_REX_W.
+const F_PREFIX_66: u32 = 4;
+
+/// This is the core 'emit' function for instructions that reference memory.
+///
+/// For an instruction that has as operands a register `encG` and a memory
+/// address `memE`, create and emit, first the REX prefix, then caller-supplied
+/// opcode byte(s) (`opcodes` and `numOpcodes`), then the MOD/RM byte, then
+/// optionally, a SIB byte, and finally optionally an immediate that will be
+/// derived from the `memE` operand.  For most instructions up to and including
+/// SSE4.2, that will be the whole instruction.
+///
+/// The opcodes are written bigendianly for the convenience of callers.  For
+/// example, if the opcode bytes to be emitted are, in this order, F3 0F 27,
+/// then the caller should pass `opcodes` == 0xF3_0F_27 and `numOpcodes` == 3.
+///
+/// The register operand is represented here not as a `Reg` but as its hardware
+/// encoding, `encG`.  `flags` can specify special handling for the REX prefix.
+/// By default, the REX prefix will indicate a 64-bit operation and will be
+/// deleted if it is redundant (0x40).  Note that for a 64-bit operation, the
+/// REX prefix will normally never be redundant, since REX.W must be 1 to
+/// indicate a 64-bit operation.
+fn emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE<O: MachSectionOutput>(
+    sink: &mut O,
+    opcodes: u32,
+    mut numOpcodes: usize,
+    encG: u8,
+    memE: &Addr,
+    flags: u32,
+) {
+    // General comment for this function: the registers in `memE` must be
+    // 64-bit integer registers, because they are part of an address
+    // expression.  But `encG` can be derived from a register of any class.
+    let prefix66 = (flags & F_PREFIX_66) != 0;
+    let clearRexW = (flags & F_CLEAR_REX_W) != 0;
+    let retainRedundant = (flags & F_RETAIN_REDUNDANT_REX) != 0;
+    // The operand-size override, if requested.  This indicates a 16-bit
+    // operation.
+    if prefix66 {
+        sink.put1(0x66);
+    }
+    match memE {
+        Addr::IR { simm32, base: regE } => {
+            // First, cook up the REX byte.  This is easy.
+            let encE = iregEnc(*regE);
+            let w = if clearRexW { 0 } else { 1 };
+            let r = (encG >> 3) & 1;
+            let x = 0;
+            let b = (encE >> 3) & 1;
+            let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+            if rex != 0x40 || retainRedundant {
+                sink.put1(rex);
+            }
+            // Now the opcode(s).  These include any other prefixes the caller
+            // hands to us.
+            while numOpcodes > 0 {
+                numOpcodes -= 1;
+                sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
+            }
+            // Now the mod/rm and associated immediates.  This is
+            // significantly complicated due to the multiple special cases.
+            if *simm32 == 0
+                && encE != regs::ENC_RSP
+                && encE != regs::ENC_RBP
+                && encE != regs::ENC_R12
+                && encE != regs::ENC_R13
+            {
+                // FIXME JRS 2020Feb11: those four tests can surely be
+                // replaced by a single mask-and-compare check.  We should do
+                // that because this routine is likely to be hot.
+                sink.put1(mkModRegRM(0, encG & 7, encE & 7));
+            } else if *simm32 == 0 && (encE == regs::ENC_RSP || encE == regs::ENC_R12) {
+                sink.put1(mkModRegRM(0, encG & 7, 4));
+                sink.put1(0x24);
+            } else if low8willSXto32(*simm32) && encE != regs::ENC_RSP && encE != regs::ENC_R12 {
+                sink.put1(mkModRegRM(1, encG & 7, encE & 7));
+                sink.put1((simm32 & 0xFF) as u8);
+            } else if encE != regs::ENC_RSP && encE != regs::ENC_R12 {
+                sink.put1(mkModRegRM(2, encG & 7, encE & 7));
+                sink.put4(*simm32);
+            } else if (encE == regs::ENC_RSP || encE == regs::ENC_R12) && low8willSXto32(*simm32) {
+                // REX.B distinguishes RSP from R12
+                sink.put1(mkModRegRM(1, encG & 7, 4));
+                sink.put1(0x24);
+                sink.put1((simm32 & 0xFF) as u8);
+            } else if encE == regs::ENC_R12 || encE == regs::ENC_RSP {
+                //.. wait for test case for RSP case
+                // REX.B distinguishes RSP from R12
+                sink.put1(mkModRegRM(2, encG & 7, 4));
+                sink.put1(0x24);
+                sink.put4(*simm32);
+            } else {
+                unreachable!("emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE: IR");
+            }
+        }
+        // Bizarrely, the IRRS case is much simpler.
+        Addr::IRRS {
+            simm32,
+            base: regBase,
+            index: regIndex,
+            shift,
+        } => {
+            let encBase = iregEnc(*regBase);
+            let encIndex = iregEnc(*regIndex);
+            // The rex byte
+            let w = if clearRexW { 0 } else { 1 };
+            let r = (encG >> 3) & 1;
+            let x = (encIndex >> 3) & 1;
+            let b = (encBase >> 3) & 1;
+            let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+            if rex != 0x40 || retainRedundant {
+                sink.put1(rex);
+            }
+            // All other prefixes and opcodes
+            while numOpcodes > 0 {
+                numOpcodes -= 1;
+                sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
+            }
+            // modrm, SIB, immediates
+            if low8willSXto32(*simm32) && encIndex != regs::ENC_RSP {
+                sink.put1(mkModRegRM(1, encG & 7, 4));
+                sink.put1(mkSIB(*shift, encIndex & 7, encBase & 7));
+                sink.put1(*simm32 as u8);
+            } else if encIndex != regs::ENC_RSP {
+                sink.put1(mkModRegRM(2, encG & 7, 4));
+                sink.put1(mkSIB(*shift, encIndex & 7, encBase & 7));
+                sink.put4(*simm32);
+            } else {
+                panic!("emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE: IRRS");
+            }
+        }
+    }
+}
+
+/// This is the core 'emit' function for instructions that do not reference
+/// memory.
+///
+/// This is conceptually the same as
+/// emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE, except it is for the case
+/// where the E operand is a register rather than memory.  Hence it is much
+/// simpler.
+fn emit_REX_OPCODES_MODRM_encG_encE<O: MachSectionOutput>(
+    sink: &mut O,
+    opcodes: u32,
+    mut numOpcodes: usize,
+    encG: u8,
+    encE: u8,
+    flags: u32,
+) {
+    // EncG and EncE can be derived from registers of any class, and they
+    // don't even have to be from the same class.  For example, for an
+    // integer-to-FP conversion insn, one might be RegClass::I64 and the other
+    // RegClass::V128.
+    let prefix66 = (flags & F_PREFIX_66) != 0;
+    let clearRexW = (flags & F_CLEAR_REX_W) != 0;
+    let retainRedundant = (flags & F_RETAIN_REDUNDANT_REX) != 0;
+    // The operand-size override
+    if prefix66 {
+        sink.put1(0x66);
+    }
+    // The rex byte
+    let w = if clearRexW { 0 } else { 1 };
+    let r = (encG >> 3) & 1;
+    let x = 0;
+    let b = (encE >> 3) & 1;
+    let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+    if rex != 0x40 || retainRedundant {
+        sink.put1(rex);
+    }
+    // All other prefixes and opcodes
+    while numOpcodes > 0 {
+        numOpcodes -= 1;
+        sink.put1(((opcodes >> (numOpcodes << 3)) & 0xFF) as u8);
+    }
+    // Now the mod/rm byte.  The instruction we're generating doesn't access
+    // memory, so there is no SIB byte or immediate -- we're done.
+    sink.put1(mkModRegRM(3, encG & 7, encE & 7));
+}
+
+// These are merely wrappers for the above two functions that facilitate passing
+// actual `Reg`s rather than their encodings.
+
+fn emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE<O: MachSectionOutput>(
+    sink: &mut O,
+    opcodes: u32,
+    numOpcodes: usize,
+    regG: Reg,
+    memE: &Addr,
+    flags: u32,
+) {
+    // JRS FIXME 2020Feb07: this should really just be `regEnc` not `iregEnc`
+    let encG = iregEnc(regG);
+    emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(sink, opcodes, numOpcodes, encG, memE, flags);
+}
+
+fn emit_REX_OPCODES_MODRM_regG_regE<O: MachSectionOutput>(
+    sink: &mut O,
+    opcodes: u32,
+    numOpcodes: usize,
+    regG: Reg,
+    regE: Reg,
+    flags: u32,
+) {
+    // JRS FIXME 2020Feb07: these should really just be `regEnc` not `iregEnc`
+    let encG = iregEnc(regG);
+    let encE = iregEnc(regE);
+    emit_REX_OPCODES_MODRM_encG_encE(sink, opcodes, numOpcodes, encG, encE, flags);
+}
+
+/// Write a suitable number of bits from an imm64 to the sink.
+fn emit_simm<O: MachSectionOutput>(sink: &mut O, size: u8, simm32: u32) {
+    match size {
+        8 | 4 => sink.put4(simm32),
+        2 => sink.put2(simm32 as u16),
+        1 => sink.put1(simm32 as u8),
+        _ => panic!("x64::Inst::emit_simm: unreachable"),
+    }
+}
+
+/// The top-level emit function.
+///
+/// Important!  Do not add improved (shortened) encoding cases to existing
+/// instructions without also adding tests for those improved encodings.  That
+/// is a dangerous game that leads to hard-to-track-down errors in the emitted
+/// code.
+///
+/// For all instructions, make sure to have test coverage for all of the
+/// following situations.  Do this by creating the cross product resulting from
+/// applying the following rules to each operand:
+///
+/// (1) for any insn that mentions a register: one test using a register from
+///     the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
+///     using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
+///     This helps detect incorrect REX prefix construction.
+///
+/// (2) for any insn that mentions a byte register: one test for each of the
+///     four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
+///     [r8b .. r11b] and [r12b .. r15b].  This checks that
+///     apparently-redundant REX prefixes are retained when required.
+///
+/// (3) for any insn that contains an immediate field, check the following
+///     cases: field is zero, field is in simm8 range (-128 .. 127), field is
+///     in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF).  This is because some
+///     instructions that require a 32-bit immediate have a short-form encoding
+///     when the imm is in simm8 range.
+///
+/// Rules (1), (2) and (3) don't apply for registers within address expressions
+/// (`Addr`s).  Those are already pretty well tested, and the registers in them
+/// don't have any effect on the containing instruction (apart from possibly
+/// require REX prefix bits).
+///
+/// When choosing registers for a test, avoid using registers with the same
+/// offset within a given group.  For example, don't use rax and r8, since they
+/// both have the lowest 3 bits as 000, and so the test won't detect errors
+/// where those 3-bit register sub-fields are confused by the emitter.  Instead
+/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001).  Similarly, don't use (eg) cl
+/// and bpl since they have the same offset in their group; use instead (eg) cl
+/// and sil.
+///
+/// For all instructions, also add a test that uses only low-half registers
+/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
+/// prefixes are correctly omitted.  This low-half restriction must apply to
+/// _all_ registers in the insn, even those in address expressions.
+///
+/// Following these rules creates large numbers of test cases, but it's the
+/// only way to make the emitter reliable.
+///
+/// Known possible improvements:
+///
+/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate.  (Do we
+///   care?)
+pub(crate) fn emit<O: MachSectionOutput>(inst: &Inst, sink: &mut O) {
+    match inst {
+        Inst::Nop { len: 0 } => {}
+        Inst::Alu_RMI_R {
+            is_64,
+            op,
+            src: srcE,
+            dst: regG,
+        } => {
+            let flags = if *is_64 { F_NONE } else { F_CLEAR_REX_W };
+            if *op == RMI_R_Op::Mul {
+                // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
+                // we have to special-case it.
+                match srcE {
+                    RMI::R { reg: regE } => {
+                        emit_REX_OPCODES_MODRM_regG_regE(
+                            sink,
+                            0x0FAF,
+                            2,
+                            regG.to_reg(),
+                            *regE,
+                            flags,
+                        );
+                    }
+                    RMI::M { addr } => {
+                        emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                            sink,
+                            0x0FAF,
+                            2,
+                            regG.to_reg(),
+                            addr,
+                            flags,
+                        );
+                    }
+                    RMI::I { simm32 } => {
+                        let useImm8 = low8willSXto32(*simm32);
+                        let opcode = if useImm8 { 0x6B } else { 0x69 };
+                        // Yes, really, regG twice.
+                        emit_REX_OPCODES_MODRM_regG_regE(
+                            sink,
+                            opcode,
+                            1,
+                            regG.to_reg(),
+                            regG.to_reg(),
+                            flags,
+                        );
+                        emit_simm(sink, if useImm8 { 1 } else { 4 }, *simm32);
+                    }
+                }
+            } else {
+                let (opcode_R, opcode_M, subopcode_I) = match op {
+                    RMI_R_Op::Add => (0x01, 0x03, 0),
+                    RMI_R_Op::Sub => (0x29, 0x2B, 5),
+                    RMI_R_Op::And => (0x21, 0x23, 4),
+                    RMI_R_Op::Or => (0x09, 0x0B, 1),
+                    RMI_R_Op::Xor => (0x31, 0x33, 6),
+                    RMI_R_Op::Mul => panic!("unreachable"),
+                };
+                match srcE {
+                    RMI::R { reg: regE } => {
+                        // Note.  The arguments .. regE .. regG .. sequence
+                        // here is the opposite of what is expected.  I'm not
+                        // sure why this is.  But I am fairly sure that the
+                        // arg order could be switched back to the expected
+                        // .. regG .. regE .. if opcode_rr is also switched
+                        // over to the "other" basic integer opcode (viz, the
+                        // R/RM vs RM/R duality).  However, that would mean
+                        // that the test results won't be in accordance with
+                        // the GNU as reference output.  In other words, the
+                        // inversion exists as a result of using GNU as as a
+                        // gold standard.
+                        emit_REX_OPCODES_MODRM_regG_regE(
+                            sink,
+                            opcode_R,
+                            1,
+                            *regE,
+                            regG.to_reg(),
+                            flags,
+                        );
+                        // NB: if this is ever extended to handle byte size
+                        // ops, be sure to retain redundant REX prefixes.
+                    }
+                    RMI::M { addr } => {
+                        // Whereas here we revert to the "normal" G-E ordering.
+                        emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                            sink,
+                            opcode_M,
+                            1,
+                            regG.to_reg(),
+                            addr,
+                            flags,
+                        );
+                    }
+                    RMI::I { simm32 } => {
+                        let useImm8 = low8willSXto32(*simm32);
+                        let opcode = if useImm8 { 0x83 } else { 0x81 };
+                        // And also here we use the "normal" G-E ordering.
+                        let encG = iregEnc(regG.to_reg());
+                        emit_REX_OPCODES_MODRM_encG_encE(sink, opcode, 1, subopcode_I, encG, flags);
+                        emit_simm(sink, if useImm8 { 1 } else { 4 }, *simm32);
+                    }
+                }
+            }
+        }
+        Inst::Imm_R {
+            dst_is_64,
+            simm64,
+            dst,
+        } => {
+            let encDst = iregEnc(dst.to_reg());
+            if *dst_is_64 {
+                // FIXME JRS 2020Feb10: also use the 32-bit case here when
+                // possible
+                sink.put1(0x48 | ((encDst >> 3) & 1));
+                sink.put1(0xB8 | (encDst & 7));
+                sink.put8(*simm64);
+            } else {
+                if ((encDst >> 3) & 1) == 1 {
+                    sink.put1(0x41);
+                }
+                sink.put1(0xB8 | (encDst & 7));
+                sink.put4(*simm64 as u32);
+            }
+        }
+        Inst::Mov_R_R { is_64, src, dst } => {
+            let flags = if *is_64 { F_NONE } else { F_CLEAR_REX_W };
+            emit_REX_OPCODES_MODRM_regG_regE(sink, 0x89, 1, *src, dst.to_reg(), flags);
+        }
+        Inst::MovZX_M_R { extMode, addr, dst } => {
+            match extMode {
+                ExtMode::BL => {
+                    // MOVZBL is (REX.W==0) 0F B6 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FB6,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_CLEAR_REX_W,
+                    )
+                }
+                ExtMode::BQ => {
+                    // MOVZBQ is (REX.W==1) 0F B6 /r
+                    // I'm not sure why the Intel manual offers different
+                    // encodings for MOVZBQ than for MOVZBL.  AIUI they should
+                    // achieve the same, since MOVZBL is just going to zero out
+                    // the upper half of the destination anyway.
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FB6,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_NONE,
+                    )
+                }
+                ExtMode::WL => {
+                    // MOVZWL is (REX.W==0) 0F B7 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FB7,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_CLEAR_REX_W,
+                    )
+                }
+                ExtMode::WQ => {
+                    // MOVZWQ is (REX.W==1) 0F B7 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FB7,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_NONE,
+                    )
+                }
+                ExtMode::LQ => {
+                    // This is just a standard 32 bit load, and we rely on the
+                    // default zero-extension rule to perform the extension.
+                    // MOV r/m32, r32 is (REX.W==0) 8B /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x8B,
+                        1,
+                        dst.to_reg(),
+                        addr,
+                        F_CLEAR_REX_W,
+                    )
+                }
+            }
+        }
+        Inst::Mov64_M_R { addr, dst } => {
+            emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, 0x8B, 1, dst.to_reg(), addr, F_NONE)
+        }
+        Inst::MovSX_M_R { extMode, addr, dst } => {
+            match extMode {
+                ExtMode::BL => {
+                    // MOVSBL is (REX.W==0) 0F BE /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FBE,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_CLEAR_REX_W,
+                    )
+                }
+                ExtMode::BQ => {
+                    // MOVSBQ is (REX.W==1) 0F BE /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FBE,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_NONE,
+                    )
+                }
+                ExtMode::WL => {
+                    // MOVSWL is (REX.W==0) 0F BF /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FBF,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_CLEAR_REX_W,
+                    )
+                }
+                ExtMode::WQ => {
+                    // MOVSWQ is (REX.W==1) 0F BF /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x0FBF,
+                        2,
+                        dst.to_reg(),
+                        addr,
+                        F_NONE,
+                    )
+                }
+                ExtMode::LQ => {
+                    // MOVSLQ is (REX.W==1) 63 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x63,
+                        1,
+                        dst.to_reg(),
+                        addr,
+                        F_NONE,
+                    )
+                }
+            }
+        }
+        Inst::Mov_R_M { size, src, addr } => {
+            match size {
+                1 => {
+                    // This is one of the few places where the presence of a
+                    // redundant REX prefix changes the meaning of the
+                    // instruction.
+                    let encSrc = iregEnc(*src);
+                    let retainRedundantRex = if encSrc >= 4 && encSrc <= 7 {
+                        F_RETAIN_REDUNDANT_REX
+                    } else {
+                        0
+                    };
+                    // MOV r8, r/m8 is (REX.W==0) 88 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x88,
+                        1,
+                        *src,
+                        addr,
+                        F_CLEAR_REX_W | retainRedundantRex,
+                    )
+                }
+                2 => {
+                    // MOV r16, r/m16 is 66 (REX.W==0) 89 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x89,
+                        1,
+                        *src,
+                        addr,
+                        F_CLEAR_REX_W | F_PREFIX_66,
+                    )
+                }
+                4 => {
+                    // MOV r32, r/m32 is (REX.W==0) 89 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(
+                        sink,
+                        0x89,
+                        1,
+                        *src,
+                        addr,
+                        F_CLEAR_REX_W,
+                    )
+                }
+                8 => {
+                    // MOV r64, r/m64 is (REX.W==1) 89 /r
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, 0x89, 1, *src, addr, F_NONE)
+                }
+                _ => panic!("x64::Inst::Mov_R_M::emit: unreachable"),
+            }
+        }
+        Inst::Shift_R {
+            is_64,
+            kind,
+            num_bits,
+            dst,
+        } => {
+            let encDst = iregEnc(dst.to_reg());
+            let subopcode = match kind {
+                ShiftKind::Left => 4,
+                ShiftKind::RightZ => 5,
+                ShiftKind::RightS => 7,
+            };
+            match num_bits {
+                None => {
+                    // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
+                    // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
+                    emit_REX_OPCODES_MODRM_encG_encE(
+                        sink,
+                        0xD3,
+                        1,
+                        subopcode,
+                        encDst,
+                        if *is_64 { F_NONE } else { F_CLEAR_REX_W },
+                    );
+                }
+                Some(num_bits) => {
+                    // SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
+                    // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
+                    // When the shift amount is 1, there's an even shorter encoding, but we don't
+                    // bother with that nicety here.
+                    emit_REX_OPCODES_MODRM_encG_encE(
+                        sink,
+                        0xC1,
+                        1,
+                        subopcode,
+                        encDst,
+                        if *is_64 { F_NONE } else { F_CLEAR_REX_W },
+                    );
+                    sink.put1(*num_bits);
+                }
+            }
+        }
+        Inst::Cmp_RMI_R {
+            size,
+            src: srcE,
+            dst: regG,
+        } => {
+            let mut retainRedundantRex = 0;
+            if *size == 1 {
+                // Here, a redundant REX prefix changes the meaning of the
+                // instruction.
+                let encG = iregEnc(*regG);
+                if encG >= 4 && encG <= 7 {
+                    retainRedundantRex = F_RETAIN_REDUNDANT_REX;
+                }
+            }
+            let mut flags = match size {
+                8 => F_NONE,
+                4 => F_CLEAR_REX_W,
+                2 => F_CLEAR_REX_W | F_PREFIX_66,
+                1 => F_CLEAR_REX_W | retainRedundantRex,
+                _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"),
+            };
+            match srcE {
+                RMI::R { reg: regE } => {
+                    let opcode = if *size == 1 { 0x38 } else { 0x39 };
+                    if *size == 1 {
+                        // We also need to check whether the E register forces
+                        // the use of a redundant REX.
+                        let encE = iregEnc(*regE);
+                        if encE >= 4 && encE <= 7 {
+                            flags |= F_RETAIN_REDUNDANT_REX;
+                        }
+                    }
+                    // Same comment re swapped args as for Alu_RMI_R.
+                    emit_REX_OPCODES_MODRM_regG_regE(sink, opcode, 1, *regE, *regG, flags);
+                }
+                RMI::M { addr } => {
+                    let opcode = if *size == 1 { 0x3A } else { 0x3B };
+                    // Whereas here we revert to the "normal" G-E ordering.
+                    emit_REX_OPCODES_MODRM_SIB_IMM_regG_memE(sink, opcode, 1, *regG, addr, flags);
+                }
+                RMI::I { simm32 } => {
+                    // FIXME JRS 2020Feb11: there are shorter encodings for
+                    // cmp $imm, rax/eax/ax/al.
+                    let useImm8 = low8willSXto32(*simm32);
+                    let opcode = if *size == 1 {
+                        0x80
+                    } else if useImm8 {
+                        0x83
+                    } else {
+                        0x81
+                    };
+                    // And also here we use the "normal" G-E ordering.
+                    let encG = iregEnc(*regG);
+                    emit_REX_OPCODES_MODRM_encG_encE(
+                        sink, opcode, 1, 7, /*subopcode*/
+                        encG, flags,
+                    );
+                    emit_simm(sink, if useImm8 { 1 } else { *size }, *simm32);
+                }
+            }
+        }
+        Inst::Push64 { src } => {
+            match src {
+                RMI::R { reg } => {
+                    let encReg = iregEnc(*reg);
+                    let rex = 0x40 | ((encReg >> 3) & 1);
+                    if rex != 0x40 {
+                        sink.put1(rex);
+                    }
+                    sink.put1(0x50 | (encReg & 7));
+                }
+                RMI::M { addr } => {
+                    emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
+                        sink,
+                        0xFF,
+                        1,
+                        6, /*subopcode*/
+                        addr,
+                        F_CLEAR_REX_W,
+                    );
+                }
+                RMI::I { simm32 } => {
+                    if low8willSXto64(*simm32) {
+                        sink.put1(0x6A);
+                        sink.put1(*simm32 as u8);
+                    } else {
+                        sink.put1(0x68);
+                        sink.put4(*simm32);
+                    }
+                }
+            }
+        }
+        Inst::Pop64 { dst } => {
+            let encDst = iregEnc(dst.to_reg());
+            if encDst >= 8 {
+                // 0x41 == REX.{W=0, B=1}.  It seems that REX.W is irrelevant
+                // here.
+                sink.put1(0x41);
+            }
+            sink.put1(0x58 + (encDst & 7));
+        }
+        //
+        // ** Inst::CallKnown
+        //
+        Inst::CallUnknown { dest } => {
+            match dest {
+                RM::R { reg } => {
+                    let regEnc = iregEnc(*reg);
+                    emit_REX_OPCODES_MODRM_encG_encE(
+                        sink,
+                        0xFF,
+                        1,
+                        2, /*subopcode*/
+                        regEnc,
+                        F_CLEAR_REX_W,
+                    );
+                }
+                RM::M { addr } => {
+                    emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
+                        sink,
+                        0xFF,
+                        1,
+                        2, /*subopcode*/
+                        addr,
+                        F_CLEAR_REX_W,
+                    );
+                }
+            }
+        }
+        Inst::Ret {} => sink.put1(0xC3),
+
+        Inst::JmpKnown {
+            dest: BranchTarget::Block(..),
+        } => {
+            // Computation of block offsets/sizes.
+            sink.put1(0);
+            sink.put4(0);
+        }
+        Inst::JmpKnown {
+            dest: BranchTarget::ResolvedOffset(_bix, offset),
+        } if *offset >= -0x7FFF_FF00 && *offset <= 0x7FFF_FF00 => {
+            // And now for real
+            let mut offs_i32 = *offset as i32;
+            offs_i32 -= 5;
+            let offs_u32 = offs_i32 as u32;
+            sink.put1(0xE9);
+            sink.put4(offs_u32);
+        }
+        //
+        // ** Inst::JmpCondSymm   XXXX should never happen
+        //
+        Inst::JmpCond {
+            cc: _,
+            target: BranchTarget::Block(..),
+        } => {
+            // This case occurs when we are computing block offsets / sizes,
+            // prior to lowering block-index targets to concrete-offset targets.
+            // Only the size matters, so let's emit 6 bytes, as below.
+            sink.put1(0);
+            sink.put1(0);
+            sink.put4(0);
+        }
+        Inst::JmpCond {
+            cc,
+            target: BranchTarget::ResolvedOffset(_bix, offset),
+        } if *offset >= -0x7FFF_FF00 && *offset <= 0x7FFF_FF00 => {
+            // This insn is 6 bytes long.  Currently `offset` is relative to
+            // the start of this insn, but the Intel encoding requires it to
+            // be relative to the start of the next instruction.  Hence the
+            // adjustment.
+            let mut offs_i32 = *offset as i32;
+            offs_i32 -= 6;
+            let offs_u32 = offs_i32 as u32;
+            sink.put1(0x0F);
+            sink.put1(0x80 + cc.get_enc());
+            sink.put4(offs_u32);
+        }
+        //
+        // ** Inst::JmpCondCompound   XXXX should never happen
+        //
+        Inst::JmpUnknown { target } => {
+            match target {
+                RM::R { reg } => {
+                    let regEnc = iregEnc(*reg);
+                    emit_REX_OPCODES_MODRM_encG_encE(
+                        sink,
+                        0xFF,
+                        1,
+                        4, /*subopcode*/
+                        regEnc,
+                        F_CLEAR_REX_W,
+                    );
+                }
+                RM::M { addr } => {
+                    emit_REX_OPCODES_MODRM_SIB_IMM_encG_memE(
+                        sink,
+                        0xFF,
+                        1,
+                        4, /*subopcode*/
+                        addr,
+                        F_CLEAR_REX_W,
+                    );
+                }
+            }
+        }
+
+        _ => panic!("x64_emit: unhandled: {} ", inst.show_rru(None)),
+    }
+}