Improve efficiency of resetting stack pointer

This commit is contained in:
Jef
2019-03-27 12:57:49 +01:00
parent 652e2fdeec
commit 79f26328d4
3 changed files with 25 additions and 69 deletions

View File

@@ -142,18 +142,16 @@ fib:
push rax push rax
push rax push rax
mov rsi, rcx mov rsi, rcx
call fib call 0
add eax, dword ptr [rsp + 8] add eax, [rsp + 8]
mov rcx, qword ptr [rsp + 0x10] mov rcx, [rsp + 0x10]
add ecx, 0xfffffffe add ecx, 0xfffffffe
cmp ecx, 1 cmp ecx, 1
mov rsi, rcx mov rsi, rcx
pop rcx lea rsp, [rsp + 0x18]
pop rcx
pop rcx
ja .Lloop ja .Lloop
.Lreturn: .Lreturn:
ret ret
``` ```
Now obviously I'm not advocating for replacing Firefox's optimising compiler with Lightbeam since the latter can only really produce better code when receiving optimised WebAssembly (and so debug-mode or hand-written WebAssembly may produce much worse output). However, this shows that even with the restrictions of a streaming compiler it's absolutely possible to produce high-quality assembly output. For the assembly above, the Lightbeam output runs within 15% of native speed. This is paramount for one of Lightbeam's intended use cases: real-time systems that want good runtime performance but cannot tolerate compiler bombs. Now obviously I'm not advocating for replacing Firefox's optimising compiler with Lightbeam since the latter can only really produce better code when receiving optimised WebAssembly (and so debug-mode or hand-written WebAssembly may produce much worse output). However, this shows that even with the restrictions of a streaming compiler it's absolutely possible to produce high-quality assembly output. For the assembly above, the Lightbeam output runs within 15% of native speed. This is paramount for one of Lightbeam's intended use cases: real-time systems that want good runtime performance but cannot tolerate compiler bombs.

View File

@@ -2262,38 +2262,32 @@ impl<'this, M: ModuleContext> Context<'this, M> {
self.free_value(selector); self.free_value(selector);
} }
fn set_stack_depth_preserve_flags(&mut self, depth: StackDepth) {
if self.block_state.depth.0 < depth.0 {
for _ in 0..depth.0 - self.block_state.depth.0 {
dynasm!(self.asm
; push rax
);
}
} else if self.block_state.depth.0 > depth.0 {
let trash = self.take_reg(I64);
for _ in 0..self.block_state.depth.0 - depth.0 {
dynasm!(self.asm
; pop Rq(trash.rq().unwrap())
);
}
self.block_state.regs.release(trash);
}
self.block_state.depth = depth;
}
fn set_stack_depth(&mut self, depth: StackDepth) { fn set_stack_depth(&mut self, depth: StackDepth) {
if self.block_state.depth.0 != depth.0 { if self.block_state.depth.0 != depth.0 {
let diff = depth.0 as i32 - self.block_state.depth.0 as i32; let diff = depth.0 as i32 - self.block_state.depth.0 as i32;
if diff.abs() == 1 { if diff.abs() == 1 {
self.set_stack_depth_preserve_flags(depth); if self.block_state.depth.0 < depth.0 {
for _ in 0..depth.0 - self.block_state.depth.0 {
dynasm!(self.asm
; push rax
);
}
} else if self.block_state.depth.0 > depth.0 {
let trash = self.take_reg(I64);
for _ in 0..self.block_state.depth.0 - depth.0 {
dynasm!(self.asm
; pop Rq(trash.rq().unwrap())
);
}
self.block_state.regs.release(trash);
}
} else { } else {
dynasm!(self.asm dynasm!(self.asm
; add rsp, (self.block_state.depth.0 as i32 - depth.0 as i32) * WORD_SIZE as i32 ; lea rsp, [rsp + (self.block_state.depth.0 as i32 - depth.0 as i32) * WORD_SIZE as i32]
); );
self.block_state.depth = depth;
} }
self.block_state.depth = depth;
} }
} }
@@ -2325,43 +2319,6 @@ impl<'this, M: ModuleContext> Context<'this, M> {
self.set_stack_depth(cc.stack_depth); self.set_stack_depth(cc.stack_depth);
} }
pub fn pass_block_args_preserve_flags(&mut self, cc: &BlockCallingConvention) {
self.do_pass_block_args(cc);
self.set_stack_depth_preserve_flags(cc.stack_depth);
}
pub fn serialize_block_args_preserve_flags(
&mut self,
cc: &BlockCallingConvention,
other_to_drop: Option<RangeInclusive<u32>>,
) -> BlockCallingConvention {
self.do_pass_block_args(cc);
let mut out_args = cc.arguments.clone();
out_args.reverse();
if let Some(to_drop) = other_to_drop {
for _ in to_drop {
let val = self.pop();
// TODO: We can use stack slots for values already on the stack but we
// don't refcount stack slots right now
let loc = CCLoc::Reg(self.into_temp_reg(None, val));
out_args.push(loc);
}
}
out_args.reverse();
self.set_stack_depth_preserve_flags(cc.stack_depth);
BlockCallingConvention {
stack_depth: cc.stack_depth,
arguments: out_args,
}
}
pub fn serialize_block_args( pub fn serialize_block_args(
&mut self, &mut self,
cc: &BlockCallingConvention, cc: &BlockCallingConvention,
@@ -5054,3 +5011,4 @@ impl IntoLabel for (LabelValue, LabelValue) {
Box::new(const_values(self.0, self.1)) Box::new(const_values(self.0, self.1))
} }
} }

View File

@@ -314,7 +314,7 @@ where
((Some(Left(ref cc)), to_drop), ref mut other @ (None, _)) ((Some(Left(ref cc)), to_drop), ref mut other @ (None, _))
| (ref mut other @ (None, _), (Some(Left(ref cc)), to_drop)) => { | (ref mut other @ (None, _), (Some(Left(ref cc)), to_drop)) => {
let mut cc = let mut cc =
ctx.serialize_block_args_preserve_flags(cc, to_drop.clone()); ctx.serialize_block_args(cc, to_drop.clone());
if let Some(to_drop) = other.1 { if let Some(to_drop) = other.1 {
drop_elements(&mut cc.arguments, to_drop.clone()); drop_elements(&mut cc.arguments, to_drop.clone());
} }