x64: Clarify and shrink up ModRM/SIB encoding (#6181)

I noticed recently that for the `ImmRegRegShift` addressing mode
Cranelift will unconditionally emit at least a 1-byte immediate for the
offset to be added to the register addition computation, even when the
offset is zero. In this case though the instruction encoding can be
slightly more compact and remove a byte. This commit started off by
applying this optimization, which resulted in the `*.clif` test changes
in this commit.

Further reading this code, however, I personally found it quite hard to
follow what was happening with all the various branches and ModRM/SIB
bits. I reviewed these encodings in the x64 architecture manual and
attempted to improve the logic for encoding here. The new version in
this commit is intended to be functionally equivalent to the prior
version where dropping a zero-offset from the `ImmRegRegShift` variant
is the only change.
This commit is contained in:
Alex Crichton
2023-04-10 14:37:19 -05:00
committed by GitHub
parent 8f1a7773a3
commit 435b6894d7
4 changed files with 119 additions and 79 deletions

View File

@@ -356,46 +356,30 @@ pub(crate) fn emit_modrm_sib_disp(
match *mem_e { match *mem_e {
Amode::ImmReg { simm32, base, .. } => { Amode::ImmReg { simm32, base, .. } => {
let enc_e = int_reg_enc(base); let enc_e = int_reg_enc(base);
let mut imm = Imm::new(simm32);
// Now the mod/rm and associated immediates. This is // Most base registers allow for a single ModRM byte plus an
// significantly complicated due to the multiple special cases. // optional immediate. If rsp is the base register, however, then a
if simm32 == 0 // SIB byte must be used.
&& enc_e != regs::ENC_RSP let enc_e_low3 = enc_e & 7;
&& enc_e != regs::ENC_RBP if enc_e_low3 != regs::ENC_RSP {
&& enc_e != regs::ENC_R12 // If the base register is rbp and there's no offset then force
&& enc_e != regs::ENC_R13 // a 1-byte zero offset since otherwise the encoding would be
{ // invalid.
// FIXME JRS 2020Feb11: those four tests can surely be if enc_e_low3 == regs::ENC_RBP {
// replaced by a single mask-and-compare check. We should do imm.force_immediate();
// that because this routine is likely to be hot. }
sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7)); sink.put1(encode_modrm(imm.m0d(), enc_g & 7, enc_e & 7));
} else if simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) { imm.emit(sink);
sink.put1(encode_modrm(0, enc_g & 7, 4));
sink.put1(0x24);
} else if low8_will_sign_extend_to_32(simm32)
&& enc_e != regs::ENC_RSP
&& enc_e != regs::ENC_R12
{
sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7));
sink.put1((simm32 & 0xFF) as u8);
} else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 {
sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7));
sink.put4(simm32);
} else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12)
&& low8_will_sign_extend_to_32(simm32)
{
// REX.B distinguishes RSP from R12
sink.put1(encode_modrm(1, enc_g & 7, 4));
sink.put1(0x24);
sink.put1((simm32 & 0xFF) as u8);
} else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP {
//.. wait for test case for RSP case
// REX.B distinguishes RSP from R12
sink.put1(encode_modrm(2, enc_g & 7, 4));
sink.put1(0x24);
sink.put4(simm32);
} else { } else {
unreachable!("ImmReg"); // Displacement from RSP is encoded with a SIB byte where
// the index and base are both encoded as RSP's encoding of
// 0b100. This special encoding means that the index register
// isn't used and the base is 0b100 with or without a
// REX-encoded 4th bit (e.g. rsp or r12)
sink.put1(encode_modrm(imm.m0d(), enc_g & 7, 0b100));
sink.put1(0b00_100_100);
imm.emit(sink);
} }
} }
@@ -409,23 +393,31 @@ pub(crate) fn emit_modrm_sib_disp(
let enc_base = int_reg_enc(*reg_base); let enc_base = int_reg_enc(*reg_base);
let enc_index = int_reg_enc(*reg_index); let enc_index = int_reg_enc(*reg_index);
// modrm, SIB, immediates. // Encoding of ModRM/SIB bytes don't allow the index register to
if low8_will_sign_extend_to_32(simm32) && enc_index != regs::ENC_RSP { // ever be rsp. Note, though, that the encoding of r12, whose three
sink.put1(encode_modrm(1, enc_g & 7, 4)); // lower bits match the encoding of rsp, is explicitly allowed with
sink.put1(encode_sib(shift, enc_index & 7, enc_base & 7)); // REX bytes so only rsp is disallowed.
sink.put1(simm32 as u8); assert!(enc_index != regs::ENC_RSP);
} else if enc_index != regs::ENC_RSP {
sink.put1(encode_modrm(2, enc_g & 7, 4)); // If the offset is zero then there is no immediate. Note, though,
sink.put1(encode_sib(shift, enc_index & 7, enc_base & 7)); // that if the base register's lower three bits are `101` then an
sink.put4(simm32); // offset must be present. This is a special case in the encoding of
} else { // the SIB byte and requires an explicit displacement with rbp/r13.
panic!("ImmRegRegShift"); let mut imm = Imm::new(simm32);
if enc_base & 7 == regs::ENC_RBP {
imm.force_immediate();
} }
// With the above determined encode the ModRM byte, then the SIB
// byte, then any immediate as necessary.
sink.put1(encode_modrm(imm.m0d(), enc_g & 7, 0b100));
sink.put1(encode_sib(shift, enc_index & 7, enc_base & 7));
imm.emit(sink);
} }
Amode::RipRelative { ref target } => { Amode::RipRelative { ref target } => {
// RIP-relative is mod=00, rm=101. // RIP-relative is mod=00, rm=101.
sink.put1(encode_modrm(0, enc_g & 7, 0b101)); sink.put1(encode_modrm(0b00, enc_g & 7, 0b101));
let offset = sink.cur_offset(); let offset = sink.cur_offset();
sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32); sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32);
@@ -441,6 +433,52 @@ pub(crate) fn emit_modrm_sib_disp(
} }
} }
/// Classification of the optional displacement immediate that follows the
/// ModRM (and possibly SIB) byte: no displacement, a sign-extended 8-bit
/// displacement, or a full 32-bit displacement.
enum Imm {
None,
Imm8(u8),
Imm32(u32),
}
impl Imm {
/// Classifies the 32-bit immediate `val` as how this can be encoded
/// with ModRM/SIB bytes.
///
/// Zero needs no displacement at all, values that sign-extend from 8
/// bits use the compact 1-byte form, and everything else uses the full
/// 4-byte form.
fn new(val: u32) -> Imm {
if val == 0 {
Imm::None
} else if low8_will_sign_extend_to_32(val) {
Imm::Imm8(val as u8)
} else {
Imm::Imm32(val)
}
}
/// Forces `Imm::None` to become `Imm::Imm8(0)`, used for special cases
/// where some base registers require an immediate.
///
/// (e.g. a base register whose low three bits are 0b101, rbp/r13, has
/// no zero-displacement encoding, so an explicit 0 byte must be
/// emitted.)
fn force_immediate(&mut self) {
if let Imm::None = self {
*self = Imm::Imm8(0);
}
}
/// Returns the two "mod" bits present at the upper bits of the mod/rm
/// byte.
fn m0d(&self) -> u8 {
match self {
Imm::None => 0b00,
Imm::Imm8(_) => 0b01,
Imm::Imm32(_) => 0b10,
}
}
/// Emits this displacement into `sink`: nothing for `None`, one byte
/// for `Imm8`, and four bytes for `Imm32`. Must agree with the "mod"
/// bits previously emitted via `m0d`.
fn emit(&self, sink: &mut MachBuffer<Inst>) {
match self {
Imm::None => {}
Imm::Imm8(n) => sink.put1(*n),
Imm::Imm32(n) => sink.put4(*n),
}
}
}
/// This is the core 'emit' function for instructions that do not reference memory. /// This is the core 'emit' function for instructions that do not reference memory.
/// ///
/// This is conceptually the same as emit_modrm_sib_enc_ge, except it is for the case where the E /// This is conceptually the same as emit_modrm_sib_enc_ge, except it is for the case where the E
@@ -473,7 +511,7 @@ pub(crate) fn emit_std_enc_enc(
// Now the mod/rm byte. The instruction we're generating doesn't access // Now the mod/rm byte. The instruction we're generating doesn't access
// memory, so there is no SIB byte or immediate -- we're done. // memory, so there is no SIB byte or immediate -- we're done.
sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7)); sink.put1(encode_modrm(0b11, enc_g & 7, enc_e & 7));
} }
// These are merely wrappers for the above two functions that facilitate passing // These are merely wrappers for the above two functions that facilitate passing

View File

@@ -343,7 +343,7 @@ block2:
; movl %edi, %r10d ; movl %edi, %r10d
; cmpl %r9d, %r10d ; cmpl %r9d, %r10d
; cmovbl %r10d, %r9d ; cmovbl %r10d, %r9d
; leaq 0xa(%rip), %rax ; leaq 9(%rip), %rax
; movslq (%rax, %r9, 4), %rcx ; movslq (%rax, %r9, 4), %rcx
; addq %rcx, %rax ; addq %rcx, %rax
; jmpq *%rax ; jmpq *%rax
@@ -353,14 +353,14 @@ block2:
; addb %al, (%rax) ; addb %al, (%rax)
; sbbb %al, (%rax) ; sbbb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; block2: ; offset 0x31 ; block2: ; offset 0x30
; jmp 0x3d ; jmp 0x3c
; block3: ; offset 0x36 ; block3: ; offset 0x35
; xorl %eax, %eax ; xorl %eax, %eax
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
; block4: ; offset 0x3d ; block4: ; offset 0x3c
; movl $1, %eax ; movl $1, %eax
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
@@ -938,7 +938,7 @@ block5(v5: i32):
; movl %edi, %ecx ; movl %edi, %ecx
; cmpl %eax, %ecx ; cmpl %eax, %ecx
; cmovbl %ecx, %eax ; cmovbl %ecx, %eax
; leaq 0xb(%rip), %r9 ; leaq 0xa(%rip), %r9
; movslq (%r9, %rax, 4), %r10 ; movslq (%r9, %rax, 4), %r10
; addq %r10, %r9 ; addq %r10, %r9
; jmpq *%r9 ; jmpq *%r9
@@ -950,20 +950,20 @@ block5(v5: i32):
; addb %al, (%rax) ; addb %al, (%rax)
; addb %dh, (%rdi) ; addb %dh, (%rdi)
; addb %al, (%rax) ; addb %al, (%rax)
; block2: ; offset 0x36 ; block2: ; offset 0x35
; jmp 0x45 ; jmp 0x44
; block3: ; offset 0x3b ; block3: ; offset 0x3a
; movl $3, %esi ; movl $3, %esi
; jmp 0x5e ; jmp 0x5d
; block4: ; offset 0x45 ; block4: ; offset 0x44
; movl $2, %esi ; movl $2, %esi
; jmp 0x5e ; jmp 0x5d
; block5: ; offset 0x4f ; block5: ; offset 0x4e
; movl $1, %esi ; movl $1, %esi
; jmp 0x5e ; jmp 0x5d
; block6: ; offset 0x59 ; block6: ; offset 0x58
; movl $4, %esi ; movl $4, %esi
; block7: ; offset 0x5e ; block7: ; offset 0x5d
; leal (%rdi, %rsi), %eax ; leal (%rdi, %rsi), %eax
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
@@ -1026,7 +1026,7 @@ block1(v5: i32):
; movl %edi, %r9d ; movl %edi, %r9d
; cmpl %r8d, %r9d ; cmpl %r8d, %r9d
; cmovbl %r9d, %r8d ; cmovbl %r9d, %r8d
; leaq 0xa(%rip), %rdi ; leaq 9(%rip), %rdi
; movslq (%rdi, %r8, 4), %rcx ; movslq (%rdi, %r8, 4), %rcx
; addq %rcx, %rdi ; addq %rcx, %rdi
; jmpq *%rdi ; jmpq *%rdi
@@ -1040,20 +1040,20 @@ block1(v5: i32):
; addb %al, (%rax) ; addb %al, (%rax)
; xorb $0, %al ; xorb $0, %al
; addb %al, (%rax) ; addb %al, (%rax)
; block2: ; offset 0x4f ; block2: ; offset 0x4e
; jmp 0x6f ; jmp 0x6e
; block3: ; offset 0x54 ; block3: ; offset 0x53
; movq %r10, %rax ; movq %r10, %rax
; jmp 0x6f ; jmp 0x6e
; block4: ; offset 0x5c ; block4: ; offset 0x5b
; movq %r11, %rax ; movq %r11, %rax
; jmp 0x6f ; jmp 0x6e
; block5: ; offset 0x64 ; block5: ; offset 0x63
; movq %r11, %rax ; movq %r11, %rax
; jmp 0x6f ; jmp 0x6e
; block6: ; offset 0x6c ; block6: ; offset 0x6b
; movq %rsi, %rax ; movq %rsi, %rax
; block7: ; offset 0x6f ; block7: ; offset 0x6e
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq

View File

@@ -1415,6 +1415,7 @@ block0(v0: i8x16, v1: i32):
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
; addb %bh, %bh
function %i8x16_shl_imm(i8x16) -> i8x16 { function %i8x16_shl_imm(i8x16) -> i8x16 {
block0(v0: i8x16): block0(v0: i8x16):
@@ -1658,7 +1659,7 @@ block0(v0: i8x16, v1: i32):
; retq ; retq
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %bh, %bh ; addb %al, (%rax)
function %i8x16_ushr_imm(i8x16) -> i8x16 { function %i8x16_ushr_imm(i8x16) -> i8x16 {
block0(v0: i8x16): block0(v0: i8x16):

View File

@@ -365,6 +365,7 @@ block0(v0: i32):
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %bh, %bh
function %ishl_i8x16_imm(i8x16) -> i8x16 { function %ishl_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16): block0(v0: i8x16):