x64: Clarify and shrink up ModRM/SIB encoding (#6181)

I noticed recently that for the `ImmRegRegShift` addressing mode
Cranelift will unconditionally emit at least a 1-byte immediate for the
offset to be added to the register addition computation, even when the
offset is zero. In this case though the instruction encoding can be
slightly more compact and remove a byte. This commit started off by
applying this optimization, which resulted in the `*.clif` test changes
in this commit.

Further reading this code, however, I personally found it quite hard to
follow what was happening with all the various branches and ModRM/SIB
bits. I reviewed these encodings in the x64 architecture manual and
attempted to improve the logic for encoding here. The new version in
this commit is intended to be functionally equivalent to the prior
version where dropping a zero-offset from the `ImmRegRegShift` variant
is the only change.
This commit is contained in:
Alex Crichton
2023-04-10 14:37:19 -05:00
committed by GitHub
parent 8f1a7773a3
commit 435b6894d7
4 changed files with 119 additions and 79 deletions

View File

@@ -356,46 +356,30 @@ pub(crate) fn emit_modrm_sib_disp(
match *mem_e { match *mem_e {
Amode::ImmReg { simm32, base, .. } => { Amode::ImmReg { simm32, base, .. } => {
let enc_e = int_reg_enc(base); let enc_e = int_reg_enc(base);
let mut imm = Imm::new(simm32);
// Now the mod/rm and associated immediates. This is // Most base registers allow for a single ModRM byte plus an
// significantly complicated due to the multiple special cases. // optional immediate. If rsp is the base register, however, then a
if simm32 == 0 // SIB byte must be used.
&& enc_e != regs::ENC_RSP let enc_e_low3 = enc_e & 7;
&& enc_e != regs::ENC_RBP if enc_e_low3 != regs::ENC_RSP {
&& enc_e != regs::ENC_R12 // If the base register is rbp and there's no offset then force
&& enc_e != regs::ENC_R13 // a 1-byte zero offset since otherwise the encoding would be
{ // invalid.
// FIXME JRS 2020Feb11: those four tests can surely be if enc_e_low3 == regs::ENC_RBP {
// replaced by a single mask-and-compare check. We should do imm.force_immediate();
// that because this routine is likely to be hot. }
sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7)); sink.put1(encode_modrm(imm.m0d(), enc_g & 7, enc_e & 7));
} else if simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) { imm.emit(sink);
sink.put1(encode_modrm(0, enc_g & 7, 4));
sink.put1(0x24);
} else if low8_will_sign_extend_to_32(simm32)
&& enc_e != regs::ENC_RSP
&& enc_e != regs::ENC_R12
{
sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7));
sink.put1((simm32 & 0xFF) as u8);
} else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 {
sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7));
sink.put4(simm32);
} else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12)
&& low8_will_sign_extend_to_32(simm32)
{
// REX.B distinguishes RSP from R12
sink.put1(encode_modrm(1, enc_g & 7, 4));
sink.put1(0x24);
sink.put1((simm32 & 0xFF) as u8);
} else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP {
//.. wait for test case for RSP case
// REX.B distinguishes RSP from R12
sink.put1(encode_modrm(2, enc_g & 7, 4));
sink.put1(0x24);
sink.put4(simm32);
} else { } else {
unreachable!("ImmReg"); // Displacement from RSP is encoded with a SIB byte where
// the index and base are both encoded as RSP's encoding of
// 0b100. This special encoding means that the index register
// isn't used and the base is 0b100 with or without a
// REX-encoded 4th bit (e.g. rsp or r12)
sink.put1(encode_modrm(imm.m0d(), enc_g & 7, 0b100));
sink.put1(0b00_100_100);
imm.emit(sink);
} }
} }
@@ -409,23 +393,31 @@ pub(crate) fn emit_modrm_sib_disp(
let enc_base = int_reg_enc(*reg_base); let enc_base = int_reg_enc(*reg_base);
let enc_index = int_reg_enc(*reg_index); let enc_index = int_reg_enc(*reg_index);
// modrm, SIB, immediates. // Encoding of ModRM/SIB bytes don't allow the index register to
if low8_will_sign_extend_to_32(simm32) && enc_index != regs::ENC_RSP { // ever be rsp. Note, though, that the encoding of r12, whose three
sink.put1(encode_modrm(1, enc_g & 7, 4)); // lower bits match the encoding of rsp, is explicitly allowed with
sink.put1(encode_sib(shift, enc_index & 7, enc_base & 7)); // REX bytes so only rsp is disallowed.
sink.put1(simm32 as u8); assert!(enc_index != regs::ENC_RSP);
} else if enc_index != regs::ENC_RSP {
sink.put1(encode_modrm(2, enc_g & 7, 4)); // If the offset is zero then there is no immediate. Note, though,
sink.put1(encode_sib(shift, enc_index & 7, enc_base & 7)); // that if the base register's lower three bits are `101` then an
sink.put4(simm32); // offset must be present. This is a special case in the encoding of
} else { // the SIB byte and requires an explicit displacement with rbp/r13.
panic!("ImmRegRegShift"); let mut imm = Imm::new(simm32);
if enc_base & 7 == regs::ENC_RBP {
imm.force_immediate();
} }
// With the above determined encode the ModRM byte, then the SIB
// byte, then any immediate as necessary.
sink.put1(encode_modrm(imm.m0d(), enc_g & 7, 0b100));
sink.put1(encode_sib(shift, enc_index & 7, enc_base & 7));
imm.emit(sink);
} }
Amode::RipRelative { ref target } => { Amode::RipRelative { ref target } => {
// RIP-relative is mod=00, rm=101. // RIP-relative is mod=00, rm=101.
sink.put1(encode_modrm(0, enc_g & 7, 0b101)); sink.put1(encode_modrm(0b00, enc_g & 7, 0b101));
let offset = sink.cur_offset(); let offset = sink.cur_offset();
sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32); sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32);
@@ -441,6 +433,52 @@ pub(crate) fn emit_modrm_sib_disp(
} }
} }
/// Classification of the optional displacement immediate that follows the
/// ModRM (and possibly SIB) byte: no displacement, a sign-extended 8-bit
/// displacement, or a full 32-bit displacement.
enum Imm {
None,
Imm8(u8),
Imm32(u32),
}
impl Imm {
/// Classifies the 32-bit immediate `val` as how this can be encoded
/// with ModRM/SIB bytes.
///
/// Zero needs no displacement at all, values that sign-extend from 8
/// bits use the compact 1-byte form, and everything else uses the full
/// 4-byte form.
fn new(val: u32) -> Imm {
if val == 0 {
Imm::None
} else if low8_will_sign_extend_to_32(val) {
Imm::Imm8(val as u8)
} else {
Imm::Imm32(val)
}
}
/// Forces `Imm::None` to become `Imm::Imm8(0)`, used for special cases
/// where some base registers require an immediate.
///
/// (e.g. a base register whose low three bits are 0b101, rbp/r13, has
/// no zero-displacement encoding, so an explicit 0 byte must be
/// emitted.)
fn force_immediate(&mut self) {
if let Imm::None = self {
*self = Imm::Imm8(0);
}
}
/// Returns the two "mod" bits present at the upper bits of the mod/rm
/// byte.
fn m0d(&self) -> u8 {
match self {
Imm::None => 0b00,
Imm::Imm8(_) => 0b01,
Imm::Imm32(_) => 0b10,
}
}
/// Emits this displacement into `sink`: nothing for `None`, one byte
/// for `Imm8`, and four bytes for `Imm32`. Must agree with the "mod"
/// bits previously emitted via `m0d`.
fn emit(&self, sink: &mut MachBuffer<Inst>) {
match self {
Imm::None => {}
Imm::Imm8(n) => sink.put1(*n),
Imm::Imm32(n) => sink.put4(*n),
}
}
}
/// This is the core 'emit' function for instructions that do not reference memory. /// This is the core 'emit' function for instructions that do not reference memory.
/// ///
/// This is conceptually the same as emit_modrm_sib_enc_ge, except it is for the case where the E /// This is conceptually the same as emit_modrm_sib_enc_ge, except it is for the case where the E
@@ -473,7 +511,7 @@ pub(crate) fn emit_std_enc_enc(
// Now the mod/rm byte. The instruction we're generating doesn't access // Now the mod/rm byte. The instruction we're generating doesn't access
// memory, so there is no SIB byte or immediate -- we're done. // memory, so there is no SIB byte or immediate -- we're done.
sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7)); sink.put1(encode_modrm(0b11, enc_g & 7, enc_e & 7));
} }
// These are merely wrappers for the above two functions that facilitate passing // These are merely wrappers for the above two functions that facilitate passing

View File

@@ -343,7 +343,7 @@ block2:
; movl %edi, %r10d ; movl %edi, %r10d
; cmpl %r9d, %r10d ; cmpl %r9d, %r10d
; cmovbl %r10d, %r9d ; cmovbl %r10d, %r9d
; leaq 0xa(%rip), %rax ; leaq 9(%rip), %rax
; movslq (%rax, %r9, 4), %rcx ; movslq (%rax, %r9, 4), %rcx
; addq %rcx, %rax ; addq %rcx, %rax
; jmpq *%rax ; jmpq *%rax
@@ -353,14 +353,14 @@ block2:
; addb %al, (%rax) ; addb %al, (%rax)
; sbbb %al, (%rax) ; sbbb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; block2: ; offset 0x31 ; block2: ; offset 0x30
; jmp 0x3d ; jmp 0x3c
; block3: ; offset 0x36 ; block3: ; offset 0x35
; xorl %eax, %eax ; xorl %eax, %eax
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
; block4: ; offset 0x3d ; block4: ; offset 0x3c
; movl $1, %eax ; movl $1, %eax
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
@@ -938,7 +938,7 @@ block5(v5: i32):
; movl %edi, %ecx ; movl %edi, %ecx
; cmpl %eax, %ecx ; cmpl %eax, %ecx
; cmovbl %ecx, %eax ; cmovbl %ecx, %eax
; leaq 0xb(%rip), %r9 ; leaq 0xa(%rip), %r9
; movslq (%r9, %rax, 4), %r10 ; movslq (%r9, %rax, 4), %r10
; addq %r10, %r9 ; addq %r10, %r9
; jmpq *%r9 ; jmpq *%r9
@@ -950,20 +950,20 @@ block5(v5: i32):
; addb %al, (%rax) ; addb %al, (%rax)
; addb %dh, (%rdi) ; addb %dh, (%rdi)
; addb %al, (%rax) ; addb %al, (%rax)
; block2: ; offset 0x36 ; block2: ; offset 0x35
; jmp 0x45 ; jmp 0x44
; block3: ; offset 0x3b ; block3: ; offset 0x3a
; movl $3, %esi ; movl $3, %esi
; jmp 0x5e ; jmp 0x5d
; block4: ; offset 0x45 ; block4: ; offset 0x44
; movl $2, %esi ; movl $2, %esi
; jmp 0x5e ; jmp 0x5d
; block5: ; offset 0x4f ; block5: ; offset 0x4e
; movl $1, %esi ; movl $1, %esi
; jmp 0x5e ; jmp 0x5d
; block6: ; offset 0x59 ; block6: ; offset 0x58
; movl $4, %esi ; movl $4, %esi
; block7: ; offset 0x5e ; block7: ; offset 0x5d
; leal (%rdi, %rsi), %eax ; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
@@ -1026,7 +1026,7 @@ block1(v5: i32):
; movl %edi, %r9d ; movl %edi, %r9d
; cmpl %r8d, %r9d ; cmpl %r8d, %r9d
; cmovbl %r9d, %r8d ; cmovbl %r9d, %r8d
; leaq 0xa(%rip), %rdi ; leaq 9(%rip), %rdi
; movslq (%rdi, %r8, 4), %rcx ; movslq (%rdi, %r8, 4), %rcx
; addq %rcx, %rdi ; addq %rcx, %rdi
; jmpq *%rdi ; jmpq *%rdi
@@ -1040,20 +1040,20 @@ block1(v5: i32):
; addb %al, (%rax) ; addb %al, (%rax)
; xorb $0, %al ; xorb $0, %al
; addb %al, (%rax) ; addb %al, (%rax)
; block2: ; offset 0x4f ; block2: ; offset 0x4e
; jmp 0x6f ; jmp 0x6e
; block3: ; offset 0x54 ; block3: ; offset 0x53
; movq %r10, %rax ; movq %r10, %rax
; jmp 0x6f ; jmp 0x6e
; block4: ; offset 0x5c ; block4: ; offset 0x5b
; movq %r11, %rax ; movq %r11, %rax
; jmp 0x6f ; jmp 0x6e
; block5: ; offset 0x64 ; block5: ; offset 0x63
; movq %r11, %rax ; movq %r11, %rax
; jmp 0x6f ; jmp 0x6e
; block6: ; offset 0x6c ; block6: ; offset 0x6b
; movq %rsi, %rax ; movq %rsi, %rax
; block7: ; offset 0x6f ; block7: ; offset 0x6e
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq

View File

@@ -1415,6 +1415,7 @@ block0(v0: i8x16, v1: i32):
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
; addb %bh, %bh
function %i8x16_shl_imm(i8x16) -> i8x16 { function %i8x16_shl_imm(i8x16) -> i8x16 {
block0(v0: i8x16): block0(v0: i8x16):
@@ -1658,7 +1659,7 @@ block0(v0: i8x16, v1: i32):
; retq ; retq
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %bh, %bh ; addb %al, (%rax)
function %i8x16_ushr_imm(i8x16) -> i8x16 { function %i8x16_ushr_imm(i8x16) -> i8x16 {
block0(v0: i8x16): block0(v0: i8x16):

View File

@@ -365,6 +365,7 @@ block0(v0: i32):
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %bh, %bh
function %ishl_i8x16_imm(i8x16) -> i8x16 { function %ishl_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16): block0(v0: i8x16):