diff --git a/src/backend.rs b/src/backend.rs
index 93d3aaff8d..90d15cac19 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -1570,7 +1570,7 @@ impl Context<'_, M> {
         &mut self,
         targets: I,
         default: Option>,
-        mut pass_args: impl FnOnce(&mut Self),
+        pass_args: impl FnOnce(&mut Self),
     ) where
         I: IntoIterator>,
         I::IntoIter: ExactSizeIterator,
@@ -2145,7 +2145,272 @@ impl Context<'_, M> {
         I64
     );

-    // `i64_mul` needs to be seperate because the immediate form of the instruction
+    /// Returned divisor is guaranteed not to be `RAX`
+    // TODO: With a proper SSE-like "Value" system we could do this way better (we wouldn't have
+    //       to move `RAX` back afterwards).
+    fn i32_full_div(
+        &mut self,
+        divisor: ValueLocation,
+        quotient: ValueLocation,
+        do_div: impl FnOnce(&mut Self, ValueLocation),
+    ) -> (ValueLocation, ValueLocation, Option<GPR>) {
+        let divisor = if ValueLocation::Reg(RAX) == divisor {
+            let new_reg = self.block_state.regs.take(I32);
+            self.copy_value(&divisor, &mut ValueLocation::Reg(new_reg));
+            self.block_state.regs.release(RAX);
+            ValueLocation::Reg(new_reg)
+        } else if let ValueLocation::Stack(_) = divisor {
+            divisor
+        } else {
+            ValueLocation::Reg(self.into_temp_reg(I32, divisor))
+        };
+
+        self.free_value(quotient);
+        let should_save_rax = !self.block_state.regs.is_free(RAX);
+
+        if let ValueLocation::Reg(r) = quotient {
+            self.block_state.regs.mark_used(r);
+        }
+
+        let saved_rax = if should_save_rax {
+            let new_reg = self.block_state.regs.take(I32);
+            dynasm!(self.asm
+                ; mov Rq(new_reg.rq().unwrap()), rax
+            );
+            Some(new_reg)
+        } else {
+            None
+        };
+
+        do_div(self, divisor);
+
+        (divisor, ValueLocation::Reg(RAX), saved_rax)
+    }
+
+    fn i32_full_div_u(
+        &mut self,
+        divisor: ValueLocation,
+        quotient: ValueLocation,
+    ) -> (ValueLocation, ValueLocation, Option<GPR>) {
+        self.i32_full_div(divisor, quotient, |this, divisor| match divisor {
+            ValueLocation::Stack(offset) => {
+                let offset = this.adjusted_offset(offset);
+                dynasm!(this.asm
+                    ; div [rsp + offset]
+                );
+            }
+            ValueLocation::Reg(r) => {
+                dynasm!(this.asm
+                    ; div Rq(r.rq().unwrap())
+                );
+            }
+            ValueLocation::Immediate(_) => unreachable!(),
+        })
+    }
+
+    fn i32_full_div_s(
+        &mut self,
+        divisor: ValueLocation,
+        quotient: ValueLocation,
+    ) -> (ValueLocation, ValueLocation, Option<GPR>) {
+        self.i32_full_div(divisor, quotient, |this, divisor| match divisor {
+            ValueLocation::Stack(offset) => {
+                let offset = this.adjusted_offset(offset);
+                dynasm!(this.asm
+                    ; idiv [rsp + offset]
+                );
+            }
+            ValueLocation::Reg(r) => {
+                dynasm!(this.asm
+                    ; idiv Rq(r.rq().unwrap())
+                );
+            }
+            ValueLocation::Immediate(_) => unreachable!(),
+        })
+    }
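A note on why `i32_full_div` goes to such lengths to keep `RAX` free: x86's `div`/`idiv` take no destination operand. The dividend implicitly occupies EDX:EAX (so EDX must be zeroed for `div`, or filled by `cdq` for `idiv`, before the instruction runs), the quotient lands in EAX, and the remainder in EDX. The sketch below, plain Rust and not part of the patch, with illustrative names, emulates that register contract for the 32-bit unsigned form.

```rust
// Emulates 32-bit `div`: dividend is EDX:EAX, quotient goes to EAX and the
// remainder to EDX. A zero divisor, or a quotient too big for EAX, raises
// #DE (divide error) on real hardware; that is modelled as `None` here.
fn emulate_div_u32(eax: u32, edx: u32, divisor: u32) -> Option<(u32, u32)> {
    let dividend = ((edx as u64) << 32) | eax as u64;
    if divisor == 0 {
        return None;
    }
    let q = dividend / divisor as u64;
    if q > u32::max_value() as u64 {
        return None;
    }
    Some((q as u32, (dividend % divisor as u64) as u32)) // (EAX, EDX)
}

fn main() {
    assert_eq!(emulate_div_u32(10, 0, 3), Some((3, 1)));
    assert_eq!(emulate_div_u32(10, 0, 0), None); // would raise #DE
}
```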
+
+    // TODO: Fast div using mul for constant divisor? It looks like LLVM doesn't do that for us when
+    //       emitting Wasm.
+    pub fn i32_div_u(&mut self) {
+        let divisor = self.pop();
+        let quotient = self.pop();
+
+        if let (Some(quotient), Some(divisor)) = (quotient.imm_i32(), divisor.imm_i32()) {
+            if divisor == 0 {
+                self.trap();
+                self.push(ValueLocation::Immediate(0u32.into()));
+            } else {
+                self.push(ValueLocation::Immediate(
+                    u32::wrapping_div(quotient as _, divisor as _).into(),
+                ));
+            }
+
+            return;
+        }
+
+        let (div, rem, saved_rax) = self.i32_full_div_u(divisor, quotient);
+
+        self.free_value(rem);
+
+        if let Some(saved) = saved_rax {
+            self.copy_value(&ValueLocation::Reg(saved), &mut ValueLocation::Reg(RAX));
+            self.block_state.regs.release(saved);
+            self.block_state.regs.mark_used(RAX);
+        }
+
+        self.push(div);
+    }
+
+    pub fn i32_rem_u(&mut self) {
+        let divisor = self.pop();
+        let quotient = self.pop();
+
+        if let (Some(quotient), Some(divisor)) = (quotient.imm_i32(), divisor.imm_i32()) {
+            if divisor == 0 {
+                self.trap();
+                self.push(ValueLocation::Immediate(0u32.into()));
+            } else {
+                self.push(ValueLocation::Immediate(
+                    (quotient as u32 % divisor as u32).into(),
+                ));
+            }
+            return;
+        }
+
+        let (div, rem, saved_rax) = self.i32_full_div_u(divisor, quotient);
+
+        self.free_value(div);
+
+        let rem = if let Some(saved) = saved_rax {
+            let new_gpr = self.block_state.regs.take(I32);
+            self.copy_value(&ValueLocation::Reg(RAX), &mut ValueLocation::Reg(new_gpr));
+            self.copy_value(&ValueLocation::Reg(saved), &mut ValueLocation::Reg(RAX));
+            self.block_state.regs.release(saved);
+            ValueLocation::Reg(new_gpr)
+        } else {
+            rem
+        };
+
+        self.push(rem);
+    }
+
+    pub fn i32_rem_s(&mut self) {
+        let divisor = self.pop();
+        let quotient = self.pop();
+
+        if let (Some(quotient), Some(divisor)) = (quotient.imm_i32(), divisor.imm_i32()) {
+            if divisor == 0 {
+                self.trap();
+                self.push(ValueLocation::Immediate(0u32.into()));
+            } else {
+                // `wrapping_rem` avoids a host-side panic on `i32::MIN % -1`,
+                // which Wasm defines as 0 rather than a trap.
+                self.push(ValueLocation::Immediate(
+                    i32::wrapping_rem(quotient, divisor).into(),
+                ));
+            }
+            return;
+        }
+
+        let (div, rem, saved_rax) = self.i32_full_div_s(divisor, quotient);
+
+        self.free_value(div);
+
+        let rem = if let Some(saved) = saved_rax {
+            let new_gpr = self.block_state.regs.take(I32);
+            self.copy_value(&ValueLocation::Reg(RAX), &mut ValueLocation::Reg(new_gpr));
+            self.copy_value(&ValueLocation::Reg(saved), &mut ValueLocation::Reg(RAX));
+            self.block_state.regs.release(saved);
+            ValueLocation::Reg(new_gpr)
+        } else {
+            rem
+        };
+
+        self.push(rem);
+    }
+
+    // TODO: Fast div using mul for constant divisor? It looks like LLVM doesn't do that for us when
+    //       emitting Wasm.
+    pub fn i32_div_s(&mut self) {
+        let divisor = self.pop();
+        let quotient = self.pop();
+
+        if let (Some(quotient), Some(divisor)) = (quotient.imm_i32(), divisor.imm_i32()) {
+            if divisor == 0 || (quotient == i32::min_value() && divisor == -1) {
+                // Both division by zero and the overflowing `i32::MIN / -1`
+                // trap in Wasm.
+                self.trap();
+                self.push(ValueLocation::Immediate(0u32.into()));
+            } else {
+                self.push(ValueLocation::Immediate(
+                    i32::wrapping_div(quotient, divisor).into(),
+                ));
+            }
+
+            return;
+        }
+
+        let (div, rem, saved_rax) = self.i32_full_div_s(divisor, quotient);
+
+        self.free_value(rem);
+
+        if let Some(saved) = saved_rax {
+            self.copy_value(&ValueLocation::Reg(saved), &mut ValueLocation::Reg(RAX));
+            self.block_state.regs.release(saved);
+            self.block_state.regs.mark_used(RAX);
+        }
+
+        self.push(div);
+    }
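For reference alongside the constant-folding branches above, these are the trapping semantics the Wasm spec assigns to 32-bit signed division and remainder. A plain-Rust sketch, not part of the patch; `None` stands for a trap:

```rust
fn wasm_i32_div_s(n: i32, d: i32) -> Option<i32> {
    if d == 0 || (n == i32::min_value() && d == -1) {
        None // division by zero and quotient overflow both trap
    } else {
        Some(n / d) // truncates toward zero
    }
}

fn wasm_i32_rem_s(n: i32, d: i32) -> Option<i32> {
    if d == 0 {
        None // only division by zero traps
    } else {
        Some(n.wrapping_rem(d)) // `i32::MIN % -1` is defined as 0
    }
}

fn main() {
    assert_eq!(wasm_i32_div_s(7, -2), Some(-3));
    assert_eq!(wasm_i32_div_s(i32::min_value(), -1), None);
    assert_eq!(wasm_i32_rem_s(i32::min_value(), -1), Some(0));
    assert_eq!(wasm_i32_rem_s(-7, 2), Some(-1)); // sign follows the dividend
}
```

The unsigned variants are simpler: only a zero divisor traps, and the quotient can never overflow.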
+
+    // `i32_mul` needs to be separate because the immediate form of the instruction
+    // has a different syntax to the immediate form of the other instructions.
+    pub fn i32_mul(&mut self) {
+        let op0 = self.pop();
+        let op1 = self.pop();
+
+        if let Some(i1) = op1.immediate() {
+            if let Some(i0) = op0.immediate() {
+                self.push(ValueLocation::Immediate(
+                    i32::wrapping_mul(i1.as_i32().unwrap(), i0.as_i32().unwrap()).into(),
+                ));
+                return;
+            }
+        }
+
+        let (op1, op0) = match op1 {
+            ValueLocation::Reg(_) => (self.into_temp_reg(I32, op1), op0),
+            _ => {
+                if op0.immediate().is_some() {
+                    (self.into_temp_reg(I32, op1), op0)
+                } else {
+                    (self.into_temp_reg(I32, op0), op1)
+                }
+            }
+        };
+
+        match op0 {
+            ValueLocation::Reg(reg) => {
+                dynasm!(self.asm
+                    ; imul Rd(op1.rq().unwrap()), Rd(reg.rq().unwrap())
+                );
+            }
+            ValueLocation::Stack(offset) => {
+                let offset = self.adjusted_offset(offset);
+                dynasm!(self.asm
+                    ; imul Rd(op1.rq().unwrap()), [rsp + offset]
+                );
+            }
+            ValueLocation::Immediate(i) => {
+                dynasm!(self.asm
+                    ; imul Rd(op1.rq().unwrap()), Rd(op1.rq().unwrap()), i.as_i32().unwrap()
+                );
+            }
+        }
+
+        self.push(ValueLocation::Reg(op1));
+        self.free_value(op0);
+    }
+
+    // `i64_mul` needs to be separate because the immediate form of the instruction
     // has a different syntax to the immediate form of the other instructions.
     pub fn i64_mul(&mut self) {
         let op0 = self.pop();
@@ -2201,55 +2466,6 @@ impl Context<'_, M> {
         self.free_value(op0);
     }

-    // `i32_mul` needs to be seperate because the immediate form of the instruction
-    // has a different syntax to the immediate form of the other instructions.
-    pub fn i32_mul(&mut self) {
-        let op0 = self.pop();
-        let op1 = self.pop();
-
-        if let Some(i1) = op1.immediate() {
-            if let Some(i0) = op0.immediate() {
-                self.push(ValueLocation::Immediate(
-                    i32::wrapping_mul(i1.as_i32().unwrap(), i0.as_i32().unwrap()).into(),
-                ));
-                return;
-            }
-        }
-
-        let (op1, op0) = match op1 {
-            ValueLocation::Reg(_) => (self.into_temp_reg(I32, op1), op0),
-            _ => {
-                if op0.immediate().is_some() {
-                    (self.into_temp_reg(I32, op1), op0)
-                } else {
-                    (self.into_temp_reg(I32, op0), op1)
-                }
-            }
-        };
-
-        match op0 {
-            ValueLocation::Reg(reg) => {
-                dynasm!(self.asm
-                    ; imul Rd(op1.rq().unwrap()), Rd(reg.rq().unwrap())
-                );
-            }
-            ValueLocation::Stack(offset) => {
-                let offset = self.adjusted_offset(offset);
-                dynasm!(self.asm
-                    ; imul Rd(op1.rq().unwrap()), [rsp + offset]
-                );
-            }
-            ValueLocation::Immediate(i) => {
-                dynasm!(self.asm
-                    ; imul Rd(op1.rq().unwrap()), Rd(op1.rq().unwrap()), i.as_i32().unwrap()
-                );
-            }
-        }
-
-        self.push(ValueLocation::Reg(op1));
-        self.free_value(op0);
-    }
-
     pub fn select(&mut self) {
         let cond = self.pop();
         let else_ = self.pop();
diff --git a/src/function_body.rs b/src/function_body.rs
index 07fa1c06bf..1796f2b77f 100644
--- a/src/function_body.rs
+++ b/src/function_body.rs
@@ -36,7 +36,7 @@ where
 {
     let ty = session.module_context.func_type(func_idx);

-    if true {
+    if false {
         let mut microwasm = vec![];

         let microwasm_conv = MicrowasmConv::new(
@@ -137,7 +137,7 @@ where
         let block = entry.get_mut();

         // TODO: Is it possible with arbitrary CFGs that a block will have _only_ backwards callers?
-        //       Certainly for Wasm that is currently impossible.
+        //       Certainly for Microwasm generated from Wasm that is currently impossible.
         if block.actual_num_callers == 0 {
             loop {
                 let done = match body.peek() {
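The hunks on either side of this point both touch the block-caller accounting: a block whose `actual_num_callers` never rises above zero is unreachable and gets skipped, and the TODO added in the next hunk suggests dropping a block's state once every expected caller has been seen. A minimal sketch of that idea, with illustrative types rather than the crate's actual ones:

```rust
use std::collections::HashMap;

#[derive(Default)]
struct BlockInfo {
    num_callers: Option<u32>, // `None` when unknowable, e.g. loop headers
    actual_num_callers: u32,
}

fn note_caller(blocks: &mut HashMap<u32, BlockInfo>, label: u32) {
    let all_seen = {
        let block = blocks.entry(label).or_default();
        block.actual_num_callers += 1;
        block.num_callers == Some(block.actual_num_callers)
    };
    if all_seen {
        // Frees the entry once every expected caller has been seen; it also
        // acts as a sanity check, since an undercounted `num_callers` would
        // make a later caller recreate the block from scratch.
        blocks.remove(&label);
    }
}
```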
@@ -245,8 +245,10 @@ where
                     }
                 }
                 Operator::BrIf { then, else_ } => {
-                    // TODO: We should add the block to the hashmap if we don't have it already
                     let (then_block, else_block) = blocks.pair_mut(&then, &else_);
+                    // TODO: If actual_num_callers == num_callers then we can remove this block from the hashmap.
+                    //       This frees memory and acts as a kind of verification that `num_callers` is set
+                    //       correctly. It doesn't help for loops and block ends generated from Wasm.

                     then_block.actual_num_callers += 1;
                     else_block.actual_num_callers += 1;
@@ -379,6 +381,10 @@ where
                 Operator::Or(Size::_32) => ctx.i32_or(),
                 Operator::Xor(Size::_32) => ctx.i32_xor(),
                 Operator::Mul(I32) => ctx.i32_mul(),
+                Operator::Div(SU32) => ctx.i32_div_u(),
+                Operator::Div(SI32) => ctx.i32_div_s(),
+                Operator::Rem(sint::I32) => ctx.i32_rem_s(),
+                Operator::Rem(sint::U32) => ctx.i32_rem_u(),
                 Operator::Shl(Size::_32) => ctx.i32_shl(),
                 Operator::Shr(sint::I32) => ctx.i32_shr_s(),
                 Operator::Shr(sint::U32) => ctx.i32_shr_u(),
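One reason the dispatch above needs signedness on `Div` and `Rem` but not on `Mul` or `Add`: the low 32 bits of a product are the same whichever way the operands are interpreted, while quotients and remainders genuinely differ. A quick standalone illustration in plain Rust, not part of the patch:

```rust
fn main() {
    let a: u32 = 0xFFFF_FFFE; // the same bits read as -2_i32
    let b: u32 = 2;

    // Multiplication: a single instruction serves both signednesses.
    assert_eq!(a.wrapping_mul(b), (a as i32).wrapping_mul(b as i32) as u32);

    // Division: the results differ, hence i32_div_u vs i32_div_s.
    assert_eq!(a / b, 0x7FFF_FFFF); // unsigned view: 4294967294 / 2
    assert_eq!((a as i32) / (b as i32), -1); // signed view: -2 / 2
}
```

Note that the `Shr` arms just below already follow the `sint::I32`-is-signed convention, so the new `Rem` arms are written to match it (signed to `i32_rem_s`, unsigned to `i32_rem_u`).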