x64: Only branch once in br_table (#5850)
This uses the `cmov` instruction, which was previously necessary for Spectre mitigation, to clamp the table index instead of zeroing it. By then placing the default target as the last entry in the table, we can use just one branch instruction in all cases. Since there isn't a bounds-check branch anymore, this sequence no longer needs Spectre mitigation. And since we don't need to be careful about preserving flags, half the instructions can be removed from this pseudoinstruction and emitted as regular instructions instead. This is a net saving of three bytes in the encoding of x64's br_table pseudoinstruction. The generated code can sometimes be longer overall because the blocks are emitted in a slightly different order. My benchmark results show a very small effect on runtime performance with this change. The spidermonkey benchmark in Sightglass runs "1.01x faster" than main by instructions retired, but with no significant difference in CPU cycles. I think that means it rarely hit the default case in any br_table instructions it executed. The pulldown-cmark benchmark in Sightglass runs "1.01x faster" than main by CPU cycles, but main runs "1.00x faster" by instructions retired. I think that means this benchmark hit the default case a significant fraction of the time, so it executes a few more instructions per br_table, but maybe the branches were predicted better.
This commit is contained in:
@@ -4049,16 +4049,10 @@
|
||||
;; wasm-table index) and then 64-bits (address addend). The small
|
||||
;; lie about the I64 type is benign, since the temporary is dead
|
||||
;; after this instruction (and its Cranelift type is thus unused).
|
||||
(tmp2 WritableGpr (temp_writable_gpr))
|
||||
(tmp2 WritableGpr (temp_writable_gpr)))
|
||||
|
||||
(size OperandSize (raw_operand_size_of_type ty))
|
||||
|
||||
(jt_size u32 (jump_table_size jt_targets)))
|
||||
|
||||
(with_flags_side_effect
|
||||
(x64_cmp size (RegMemImm.Imm jt_size) idx)
|
||||
(ConsumesFlags.ConsumesFlagsSideEffect
|
||||
(MInst.JmpTableSeq idx tmp1 tmp2 default_target jt_targets)))))
|
||||
(SideEffectNoResult.Inst
|
||||
(MInst.JmpTableSeq idx tmp1 tmp2 default_target jt_targets))))
|
||||
|
||||
;;;; iadd_pairwise constants ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
@@ -1596,42 +1596,15 @@ pub(crate) fn emit(
|
||||
// maximum range of 2 GB. If we later consider using shorter-range label references,
|
||||
// this will need to be revisited.
|
||||
|
||||
// Save index in a tmp (the live range of ridx only goes to start of this
|
||||
// sequence; rtmp1 or rtmp2 may overwrite it).
|
||||
|
||||
// We generate the following sequence:
|
||||
// ;; generated by lowering: cmp #jmp_table_size, %idx
|
||||
// jnb $default_target
|
||||
// movl %idx, %tmp2
|
||||
// mov $0, %tmp1
|
||||
// cmovnb %tmp1, %tmp2 ;; Spectre mitigation.
|
||||
// We generate the following sequence. Note that the only read of %idx is before the
|
||||
// write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
|
||||
// if you change this.
|
||||
// lea start_of_jump_table_offset(%rip), %tmp1
|
||||
// movslq [%tmp1, %tmp2, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
|
||||
// movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
|
||||
// addq %tmp2, %tmp1
|
||||
// j *%tmp1
|
||||
// $start_of_jump_table:
|
||||
// -- jump table entries
|
||||
one_way_jmp(sink, CC::NB, *default_target); // idx unsigned >= jmp table size
|
||||
|
||||
// Copy the index (and make sure to clear the high 32-bits lane of tmp2).
|
||||
let inst = Inst::movzx_rm_r(ExtMode::LQ, RegMem::reg(idx), tmp2);
|
||||
inst.emit(&[], sink, info, state);
|
||||
|
||||
// Zero `tmp1` to overwrite `tmp2` with zeroes on the
|
||||
// out-of-bounds case (Spectre mitigation using CMOV).
|
||||
// Note that we need to do this with a move-immediate
|
||||
// form, because we cannot clobber the flags.
|
||||
let inst = Inst::imm(OperandSize::Size32, 0, tmp1);
|
||||
inst.emit(&[], sink, info, state);
|
||||
|
||||
// Spectre mitigation: CMOV to zero the index if the out-of-bounds branch above misspeculated.
|
||||
let inst = Inst::cmove(
|
||||
OperandSize::Size64,
|
||||
CC::NB,
|
||||
RegMem::reg(tmp1.to_reg()),
|
||||
tmp2,
|
||||
);
|
||||
inst.emit(&[], sink, info, state);
|
||||
|
||||
// Load base address of jump table.
|
||||
let start_of_jumptable = sink.get_label();
|
||||
@@ -1645,7 +1618,7 @@ pub(crate) fn emit(
|
||||
RegMem::mem(Amode::imm_reg_reg_shift(
|
||||
0,
|
||||
Gpr::new(tmp1.to_reg()).unwrap(),
|
||||
Gpr::new(tmp2.to_reg()).unwrap(),
|
||||
Gpr::new(idx).unwrap(),
|
||||
2,
|
||||
)),
|
||||
tmp2,
|
||||
@@ -1668,7 +1641,7 @@ pub(crate) fn emit(
|
||||
// Emit jump table (table of 32-bit offsets).
|
||||
sink.bind_label(start_of_jumptable);
|
||||
let jt_off = sink.cur_offset();
|
||||
for &target in targets.iter() {
|
||||
for &target in targets.iter().chain(std::iter::once(default_target)) {
|
||||
let word_off = sink.cur_offset();
|
||||
// off_into_table is an addend here embedded in the label to be later patched at
|
||||
// the end of codegen. The offset is initially relative to this jump table entry;
|
||||
|
||||
@@ -2216,7 +2216,10 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
|
||||
} => {
|
||||
collector.reg_use(*idx);
|
||||
collector.reg_early_def(*tmp1);
|
||||
collector.reg_early_def(*tmp2);
|
||||
// In the sequence emitted for this pseudoinstruction in emit.rs,
|
||||
// tmp2 is only written after idx is read, so it doesn't need to be
|
||||
// an early def.
|
||||
collector.reg_def(*tmp2);
|
||||
}
|
||||
|
||||
Inst::JmpUnknown { target } => {
|
||||
|
||||
@@ -2923,7 +2923,14 @@
|
||||
;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower_branch (br_table idx @ (value_type ty) _) (jump_table_targets default_target jt_targets))
|
||||
(emit_side_effect (jmp_table_seq ty idx default_target jt_targets)))
|
||||
(let ((size OperandSize (raw_operand_size_of_type ty))
|
||||
(jt_size u32 (jump_table_size jt_targets))
|
||||
(size_reg Reg (imm ty (u32_as_u64 jt_size)))
|
||||
(idx_reg Gpr (extend_to_gpr idx $I64 (ExtendKind.Zero)))
|
||||
(clamped_idx Reg (with_flags_reg
|
||||
(x64_cmp size size_reg idx_reg)
|
||||
(cmove ty (CC.B) idx_reg size_reg))))
|
||||
(emit_side_effect (jmp_table_seq ty clamped_idx default_target jt_targets))))
|
||||
|
||||
;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
@@ -314,8 +314,11 @@ block2:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; cmpl $2, %edi
|
||||
; br_table %rdi, %r8, %r9
|
||||
; movl $2, %r9d
|
||||
; movl %edi, %r10d
|
||||
; cmpl %r9d, %r10d
|
||||
; cmovbl %r10d, %r9d, %r9d
|
||||
; br_table %r9, %rax, %rcx
|
||||
; block1:
|
||||
; jmp label4
|
||||
; block2:
|
||||
@@ -336,25 +339,28 @@ block2:
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; cmpl $2, %edi
|
||||
; jae 0x40
|
||||
; movl %edi, %r9d
|
||||
; movl $0, %r8d
|
||||
; cmovaeq %r8, %r9
|
||||
; leaq 0xb(%rip), %r8
|
||||
; movslq (%r8, %r9, 4), %r9
|
||||
; addq %r9, %r8
|
||||
; jmpq *%r8
|
||||
; adcb $0, %al
|
||||
; movl $2, %r9d
|
||||
; movl %edi, %r10d
|
||||
; cmpl %r9d, %r10d
|
||||
; cmovbl %r10d, %r9d
|
||||
; leaq 0xa(%rip), %rax
|
||||
; movslq (%rax, %r9, 4), %rcx
|
||||
; addq %rcx, %rax
|
||||
; jmpq *%rax
|
||||
; sbbb %al, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; block2: ; offset 0x34
|
||||
; jmp 0x40
|
||||
; block3: ; offset 0x39
|
||||
; adcl %eax, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; sbbb %al, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; block2: ; offset 0x31
|
||||
; jmp 0x3d
|
||||
; block3: ; offset 0x36
|
||||
; xorl %eax, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
; block4: ; offset 0x40
|
||||
; block4: ; offset 0x3d
|
||||
; movl $1, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
@@ -756,27 +762,30 @@ block5(v5: i32):
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; cmpl $4, %edi
|
||||
; br_table %rdi, %rsi, %rax
|
||||
; movl $4, %eax
|
||||
; movl %edi, %ecx
|
||||
; cmpl %eax, %ecx
|
||||
; cmovbl %ecx, %eax, %eax
|
||||
; br_table %rax, %r9, %r10
|
||||
; block1:
|
||||
; jmp label4
|
||||
; block2:
|
||||
; jmp label4
|
||||
; block3:
|
||||
; movl $3, %r9d
|
||||
; movl $3, %esi
|
||||
; jmp label7
|
||||
; block4:
|
||||
; movl $2, %r9d
|
||||
; movl $2, %esi
|
||||
; jmp label7
|
||||
; block5:
|
||||
; movl $1, %r9d
|
||||
; movl $1, %esi
|
||||
; jmp label7
|
||||
; block6:
|
||||
; movl $4, %r9d
|
||||
; movl $4, %esi
|
||||
; jmp label7
|
||||
; block7:
|
||||
; movq %rdi, %rax
|
||||
; addl %eax, %r9d, %eax
|
||||
; addl %eax, %esi, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; ret
|
||||
@@ -786,37 +795,38 @@ block5(v5: i32):
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; cmpl $4, %edi
|
||||
; jae 0x5f
|
||||
; movl %edi, %eax
|
||||
; movl $0, %esi
|
||||
; cmovaeq %rsi, %rax
|
||||
; leaq 0xa(%rip), %rsi
|
||||
; movslq (%rsi, %rax, 4), %rax
|
||||
; addq %rax, %rsi
|
||||
; jmpq *%rsi
|
||||
; subl (%rax), %eax
|
||||
; movl $4, %eax
|
||||
; movl %edi, %ecx
|
||||
; cmpl %eax, %ecx
|
||||
; cmovbl %ecx, %eax
|
||||
; leaq 0xb(%rip), %r9
|
||||
; movslq (%r9, %rax, 4), %r10
|
||||
; addq %r10, %r9
|
||||
; jmpq *%r9
|
||||
; subl $0x23000000, %eax
|
||||
; addb %al, (%rax)
|
||||
; andb %al, (%rax)
|
||||
; addb %ah, (%rbx)
|
||||
; addb %al, (%rax)
|
||||
; andb %al, (%rax)
|
||||
; addb %bl, (%rcx)
|
||||
; addb %al, (%rax)
|
||||
; block2: ; offset 0x39
|
||||
; jmp 0x49
|
||||
; block3: ; offset 0x3e
|
||||
; movl $3, %r9d
|
||||
; jmp 0x65
|
||||
; block4: ; offset 0x49
|
||||
; movl $2, %r9d
|
||||
; jmp 0x65
|
||||
; block5: ; offset 0x54
|
||||
; movl $1, %r9d
|
||||
; jmp 0x65
|
||||
; block6: ; offset 0x5f
|
||||
; movl $4, %r9d
|
||||
; block7: ; offset 0x65
|
||||
; addb %dh, (%rdi)
|
||||
; addb %al, (%rax)
|
||||
; block2: ; offset 0x36
|
||||
; jmp 0x45
|
||||
; block3: ; offset 0x3b
|
||||
; movl $3, %esi
|
||||
; jmp 0x5e
|
||||
; block4: ; offset 0x45
|
||||
; movl $2, %esi
|
||||
; jmp 0x5e
|
||||
; block5: ; offset 0x4f
|
||||
; movl $1, %esi
|
||||
; jmp 0x5e
|
||||
; block6: ; offset 0x59
|
||||
; movl $4, %esi
|
||||
; block7: ; offset 0x5e
|
||||
; movq %rdi, %rax
|
||||
; addl %r9d, %eax
|
||||
; addl %esi, %eax
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
@@ -837,25 +847,28 @@ block1(v5: i32):
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block0:
|
||||
; movl $1, %edx
|
||||
; movl $2, %r8d
|
||||
; movl $3, %r9d
|
||||
; movl $1, %r10d
|
||||
; movl $2, %r11d
|
||||
; movl $3, %esi
|
||||
; movl $4, %eax
|
||||
; cmpl $4, %edi
|
||||
; br_table %rdi, %r11, %r10
|
||||
; movl $4, %r8d
|
||||
; movl %edi, %r9d
|
||||
; cmpl %r8d, %r9d
|
||||
; cmovbl %r9d, %r8d, %r8d
|
||||
; br_table %r8, %rdi, %rcx
|
||||
; block1:
|
||||
; jmp label6
|
||||
; block2:
|
||||
; movq %rdx, %rax
|
||||
; movq %r10, %rax
|
||||
; jmp label6
|
||||
; block3:
|
||||
; movq %r8, %rax
|
||||
; movq %r11, %rax
|
||||
; jmp label6
|
||||
; block4:
|
||||
; movq %r8, %rax
|
||||
; movq %r11, %rax
|
||||
; jmp label6
|
||||
; block5:
|
||||
; movq %r9, %rax
|
||||
; movq %rsi, %rax
|
||||
; jmp label6
|
||||
; block6:
|
||||
; movq %rbp, %rsp
|
||||
@@ -867,37 +880,42 @@ block1(v5: i32):
|
||||
; pushq %rbp
|
||||
; movq %rsp, %rbp
|
||||
; block1: ; offset 0x4
|
||||
; movl $1, %edx
|
||||
; movl $2, %r8d
|
||||
; movl $3, %r9d
|
||||
; movl $1, %r10d
|
||||
; movl $2, %r11d
|
||||
; movl $3, %esi
|
||||
; movl $4, %eax
|
||||
; cmpl $4, %edi
|
||||
; jae 0x72
|
||||
; movl %edi, %r10d
|
||||
; movl $0, %r11d
|
||||
; cmovaeq %r11, %r10
|
||||
; leaq 0xb(%rip), %r11
|
||||
; movslq (%r11, %r10, 4), %r10
|
||||
; addq %r10, %r11
|
||||
; jmpq *%r11
|
||||
; adcl $0x1d000000, %eax
|
||||
; movl $4, %r8d
|
||||
; movl %edi, %r9d
|
||||
; cmpl %r8d, %r9d
|
||||
; cmovbl %r9d, %r8d
|
||||
; leaq 0xa(%rip), %rdi
|
||||
; movslq (%rdi, %r8, 4), %rcx
|
||||
; addq %rcx, %rdi
|
||||
; jmpq *%rdi
|
||||
; sbbl %eax, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; addb %ah, 0x2d000000(%rip)
|
||||
; andl %eax, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; block2: ; offset 0x52
|
||||
; jmp 0x72
|
||||
; block3: ; offset 0x57
|
||||
; movq %rdx, %rax
|
||||
; jmp 0x72
|
||||
; block4: ; offset 0x5f
|
||||
; movq %r8, %rax
|
||||
; jmp 0x72
|
||||
; block5: ; offset 0x67
|
||||
; movq %r8, %rax
|
||||
; jmp 0x72
|
||||
; block6: ; offset 0x6f
|
||||
; movq %r9, %rax
|
||||
; block7: ; offset 0x72
|
||||
; subl %eax, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; xorl %eax, (%rax)
|
||||
; addb %al, (%rax)
|
||||
; xorb $0, %al
|
||||
; addb %al, (%rax)
|
||||
; block2: ; offset 0x4f
|
||||
; jmp 0x6f
|
||||
; block3: ; offset 0x54
|
||||
; movq %r10, %rax
|
||||
; jmp 0x6f
|
||||
; block4: ; offset 0x5c
|
||||
; movq %r11, %rax
|
||||
; jmp 0x6f
|
||||
; block5: ; offset 0x64
|
||||
; movq %r11, %rax
|
||||
; jmp 0x6f
|
||||
; block6: ; offset 0x6c
|
||||
; movq %rsi, %rax
|
||||
; block7: ; offset 0x6f
|
||||
; movq %rbp, %rsp
|
||||
; popq %rbp
|
||||
; retq
|
||||
|
||||
Reference in New Issue
Block a user