diff --git a/src/backend.rs b/src/backend.rs
index 09d7140648..6b7f47a8f5 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -46,7 +46,7 @@ const NUM_GPRS: u8 = 16;
 impl GPRs {
     fn take(&mut self) -> GPR {
         let lz = self.bits.trailing_zeros();
-        assert!(lz < 16, "ran out of free GPRs");
+        debug_assert!(lz < 16, "ran out of free GPRs");
         let gpr = lz as GPR;
         self.mark_used(gpr);
         gpr
@@ -57,7 +57,7 @@ impl GPRs {
     }
 
     fn release(&mut self, gpr: GPR) {
-        assert!(!self.is_free(gpr), "released register was already free",);
+        debug_assert!(!self.is_free(gpr), "released register was already free");
         self.bits |= 1 << gpr;
     }
@@ -129,18 +129,13 @@ enum ValueLocation {
     /// before reading (as RSP may have been changed by `push`/`pop`).
     Stack(i32),
     /// Value is a literal (TODO: Support more than just `i32`)
-    Immediate(i32),
+    Immediate(i64),
 }
 
 // TODO: This assumes only the System-V calling convention.
 // In the System-V calling convention the first 6 arguments are passed via registers.
 // All remaining arguments are passed on the stack.
 const ARGS_IN_GPRS: &[GPR] = &[RDI, RSI, RDX, RCX, R8, R9];
-// RAX is reserved for return values. In the future we want a system to allow
-// use of specific registers by saving/restoring them. This would allow using
-// RAX as a scratch register when we're not calling a function, and would also
-// allow us to call instructions that require specific registers.
-//
// List of scratch registers taken from https://wiki.osdev.org/System_V_ABI
 const SCRATCH_REGS: &[GPR] = &[RAX, R10, R11];
@@ -166,7 +161,7 @@ impl CodeGenSession {
     {
         let func_start = &mut self.func_starts[func_idx as usize];
 
-        // At this point we now the exact start address of this function. Save it
+        // At this point we know the exact start address of this function. Save it
         // and define a dynamic label at this location.
         func_start.0 = Some(self.assembler.offset());
         self.assembler.dynamic_label(func_start.1);
@@ -217,11 +212,11 @@ impl TranslatedCodeSection {
 enum Value {
     Local(u32),
     Temp(GPR),
-    Immediate(i32),
+    Immediate(i64),
 }
 
 impl Value {
-    fn immediate(&self) -> Option<i32> {
+    fn immediate(&self) -> Option<i64> {
         match *self {
             Value::Immediate(i) => Some(i),
             _ => None,
@@ -241,7 +236,7 @@ impl Value {
 enum StackValue {
     Local(u32),
     Temp(GPR),
-    Immediate(i32),
+    Immediate(i64),
     Pop,
 }
@@ -480,7 +475,7 @@ pub fn reset_block(ctx: &mut Context, parent_block_state: BlockState) {
 pub fn end_block(ctx: &mut Context, parent_block_state: BlockState) {
     // TODO: This should currently never be called, but it will matter if we
     // ever want a more complex stack spilling scheme.
-    assert_eq!(
+    debug_assert_eq!(
         ctx.block_state.depth,
         parent_block_state.depth,
         "Imbalanced pushes and pops"
     );
@@ -499,15 +494,6 @@ pub fn end_block(ctx: &mut Context, parent_block_state: BlockState) {
     }
 }
 
-fn push_return_value(ctx: &mut Context, arity: u32) {
-    if arity == 0 {
-        return;
-    }
-    assert_eq!(arity, 1);
-    ctx.block_state.regs.mark_used(RAX);
-    ctx.block_state.stack.push(StackValue::Temp(RAX));
-}
-
 fn restore_locals(ctx: &mut Context) {
     for (src, dst) in ctx
         .block_state
@@ -521,7 +507,7 @@ fn restore_locals(ctx: &mut Context) {
     }
 }
 
-fn push_i32(ctx: &mut Context, value: Value) {
+fn push(ctx: &mut Context, value: Value) {
     let stack_loc = match value {
         Value::Local(loc) => StackValue::Local(loc),
         Value::Immediate(i) => StackValue::Immediate(i),
@@ -543,7 +529,7 @@ fn push_i32(ctx: &mut Context, value: Value) {
     ctx.block_state.stack.push(stack_loc);
 }
 
-fn pop_i32(ctx: &mut Context) -> Value {
+fn pop(ctx: &mut Context) -> Value {
     match ctx.block_state.stack.pop().expect("Stack is empty") {
         StackValue::Local(loc) => Value::Local(loc),
         StackValue::Immediate(i) => Value::Immediate(i),
@@ -607,7 +593,7 @@ pub fn drop(ctx: &mut Context) {
     }
 }
 
-fn pop_i32_into(ctx: &mut Context, dst: ValueLocation) {
+fn pop_into(ctx: &mut Context, dst: ValueLocation) {
     let val = ctx.block_state.stack.pop().expect("Stack is empty");
     put_stack_val_into(ctx, val, dst);
 }
@@ -632,9 +618,7 @@ fn into_reg(ctx: &mut Context, val: Value) -> GPR {
         }
         ValueLocation::Immediate(i) => {
             let scratch = ctx.block_state.regs.take_scratch_gpr();
-            dynasm!(ctx.asm
-                ; mov Rq(scratch), i
-            );
+            immediate_to_reg(ctx, scratch, i);
             scratch
         }
         ValueLocation::Reg(reg) => reg,
@@ -670,9 +654,7 @@ fn into_temp_reg(ctx: &mut Context, val: Value) -> GPR {
         Value::Immediate(i) => {
             let scratch = ctx.block_state.regs.take_scratch_gpr();
 
-            dynasm!(ctx.asm
-                ; mov Rq(scratch), i
-            );
+            immediate_to_reg(ctx, scratch, i);
 
             scratch
         }
@@ -680,15 +662,15 @@
     }
 }
 
-macro_rules! commutative_binop {
+macro_rules! commutative_binop_i32 {
     ($name:ident, $instr:ident, $const_fallback:expr) => {
         pub fn $name(ctx: &mut Context) {
-            let op0 = pop_i32(ctx);
-            let op1 = pop_i32(ctx);
+            let op0 = pop(ctx);
+            let op1 = pop(ctx);
 
             if let Some(i1) = op1.immediate() {
                 if let Some(i0) = op0.immediate() {
-                    ctx.block_state.stack.push(StackValue::Immediate($const_fallback(i1, i0)));
+                    ctx.block_state.stack.push(StackValue::Immediate($const_fallback(i1 as i32, i0 as i32) as _));
                     return;
                 }
            }
@@ -716,7 +698,7 @@
                }
                ValueLocation::Immediate(i) => {
                    dynasm!(ctx.asm
-                        ; $instr Rd(op1), i
+                        ; $instr Rd(op1), i as i32
                    );
                }
            }
@@ -727,22 +709,78 @@
        }
    }
 }
 
-commutative_binop!(i32_add, add, i32::wrapping_add);
-commutative_binop!(i32_and, and, |a, b| a & b);
-commutative_binop!(i32_or, or, |a, b| a | b);
-commutative_binop!(i32_xor, xor, |a, b| a ^ b);
+macro_rules! commutative_binop_i64 {
+    ($name:ident, $instr:ident, $const_fallback:expr) => {
+        pub fn $name(ctx: &mut Context) {
+            let op0 = pop(ctx);
+            let op1 = pop(ctx);
+
+            if let Some(i1) = op1.immediate() {
+                if let Some(i0) = op0.immediate() {
+                    ctx.block_state.stack.push(StackValue::Immediate($const_fallback(i1, i0)));
+                    return;
+                }
+            }
+
+            let (op1, op0) = match op1 {
+                Value::Temp(reg) => (reg, op0),
+                _ => if op0.immediate().is_some() {
+                    (into_temp_reg(ctx, op1), op0)
+                } else {
+                    (into_temp_reg(ctx, op0), op1)
+                }
+            };
+
+            match op0.location(&ctx.block_state.locals) {
+                ValueLocation::Reg(reg) => {
+                    dynasm!(ctx.asm
+                        ; $instr Rq(op1), Rq(reg)
+                    );
+                }
+                ValueLocation::Stack(offset) => {
+                    let offset = adjusted_offset(ctx, offset);
+                    dynasm!(ctx.asm
+                        ; $instr Rq(op1), [rsp + offset]
+                    );
+                }
+                ValueLocation::Immediate(i) => {
+                    // 64-bit ALU ops sign-extend their 32-bit immediate, so the
+                    // value must survive a round-trip through `i32`.
+                    if i == i64::from(i as i32) {
+                        dynasm!(ctx.asm
+                            ; $instr Rq(op1), i as i32
+                        );
+                    } else {
+                        unimplemented!(concat!("Unsupported `", stringify!($instr), "` with large 64-bit immediate operand"));
+                    }
+                }
+            }
+
+            ctx.block_state.stack.push(StackValue::Temp(op1));
+            free_value(ctx, op0);
+        }
+    }
+}
 
+// TODO: Use `inc`/`dec` where possible?
+commutative_binop_i32!(i32_add, add, |a, b| (a as i32).wrapping_add(b as i32));
+commutative_binop_i32!(i32_and, and, |a, b| a & b);
+commutative_binop_i32!(i32_or, or, |a, b| a | b);
+commutative_binop_i32!(i32_xor, xor, |a, b| a ^ b);
+
+commutative_binop_i64!(i64_add, add, i64::wrapping_add);
+commutative_binop_i64!(i64_and, and, |a, b| a & b);
+commutative_binop_i64!(i64_or, or, |a, b| a | b);
+commutative_binop_i64!(i64_xor, xor, |a, b| a ^ b);
 
 // `i32_mul` needs to be separate because the immediate form of the instruction
 // has a different syntax to the immediate form of the other instructions.
 pub fn i32_mul(ctx: &mut Context) {
-    let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
+    let op0 = pop(ctx);
+    let op1 = pop(ctx);
 
     if let Some(i1) = op1.immediate() {
         if let Some(i0) = op0.immediate() {
-            ctx.block_state
-                .stack
-                .push(StackValue::Immediate(i32::wrapping_mul(i1, i0)));
+            ctx.block_state.stack.push(StackValue::Immediate(
+                i32::wrapping_mul(i1 as i32, i0 as i32) as _,
+            ));
             return;
         }
     }
@@ -772,7 +810,7 @@ pub fn i32_mul(ctx: &mut Context) {
         }
         ValueLocation::Immediate(i) => {
             dynasm!(ctx.asm
-                ; imul Rd(op1), Rd(op1), i
+                ; imul Rd(op1), Rd(op1), i as i32
             );
         }
     }
@@ -781,11 +819,109 @@ pub fn i32_mul(ctx: &mut Context) {
     free_value(ctx, op0);
 }
 
+// `sub` is not commutative, so we have to handle it differently (we _must_ use the `op1`
+// temp register as the output)
+pub fn i64_sub(ctx: &mut Context) {
+    let op0 = pop(ctx);
+    let op1 = pop(ctx);
+
+    if let Some(i1) = op1.immediate() {
+        if let Some(i0) = op0.immediate() {
+            ctx.block_state.stack.push(StackValue::Immediate(i1.wrapping_sub(i0)));
+            return;
+        }
+    }
+
+    let op1 = into_temp_reg(ctx, op1);
+    match op0.location(&ctx.block_state.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; sub Rq(op1), Rq(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; sub Rq(op1), [rsp + offset]
+            );
+        }
+        ValueLocation::Immediate(i) => {
+            // `sub r64, imm32` sign-extends the immediate.
+            if i == i64::from(i as i32) {
+                dynasm!(ctx.asm
+                    ; sub Rq(op1), i as i32
+                );
+            } else {
+                unimplemented!(concat!(
+                    "Unsupported `sub` with large 64-bit immediate operand"
+                ));
+            }
+        }
+    }
+
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_value(ctx, op0);
+}
+
+// `i64_mul` needs to be separate because the immediate form of the instruction
+// has a different syntax to the immediate form of the other instructions.
+pub fn i64_mul(ctx: &mut Context) {
+    let op0 = pop(ctx);
+    let op1 = pop(ctx);
+
+    if let Some(i1) = op1.immediate() {
+        if let Some(i0) = op0.immediate() {
+            ctx.block_state
+                .stack
+                .push(StackValue::Immediate(i64::wrapping_mul(i1, i0)));
+            return;
+        }
+    }
+
+    let (op1, op0) = match op1 {
+        Value::Temp(reg) => (reg, op0),
+        _ => {
+            if op0.immediate().is_some() {
+                (into_temp_reg(ctx, op1), op0)
+            } else {
+                (into_temp_reg(ctx, op0), op1)
+            }
+        }
+    };
+
+    match op0.location(&ctx.block_state.locals) {
+        ValueLocation::Reg(reg) => {
+            dynasm!(ctx.asm
+                ; imul Rq(op1), Rq(reg)
+            );
+        }
+        ValueLocation::Stack(offset) => {
+            let offset = adjusted_offset(ctx, offset);
+            dynasm!(ctx.asm
+                ; imul Rq(op1), [rsp + offset]
+            );
+        }
+        ValueLocation::Immediate(i) => {
+            // `imul r64, r64, imm32` sign-extends the immediate.
+            if i == i64::from(i as i32) {
+                dynasm!(ctx.asm
+                    ; imul Rq(op1), Rq(op1), i as i32
+                );
+            } else {
+                unimplemented!(concat!(
+                    "Unsupported `imul` with large 64-bit immediate operand"
+                ));
+            }
+        }
+    }
+
+    ctx.block_state.stack.push(StackValue::Temp(op1));
+    free_value(ctx, op0);
+}
+
 // `sub` is not commutative, so we have to handle it differently (we _must_ use the `op1`
 // temp register as the output)
 pub fn i32_sub(ctx: &mut Context) {
-    let op0 = pop_i32(ctx);
-    let op1 = pop_i32(ctx);
+    let op0 = pop(ctx);
+    let op1 = pop(ctx);
 
     if let Some(i1) = op1.immediate() {
         if let Some(i0) = op0.immediate() {
@@ -809,7 +945,7 @@ pub fn i32_sub(ctx: &mut Context) {
         }
         ValueLocation::Immediate(i) => {
             dynasm!(ctx.asm
-                ; sub Rd(op1), i
+                ; sub Rd(op1), i as i32
             );
         }
     }
@@ -819,13 +955,13 @@ pub fn i32_sub(ctx: &mut Context) {
 }
 
 pub fn get_local_i32(ctx: &mut Context, local_idx: u32) {
-    push_i32(ctx, Value::Local(local_idx));
+    push(ctx, Value::Local(local_idx));
 }
 
 // TODO: We can put locals that were spilled to the stack
 // back into registers here.
 pub fn set_local_i32(ctx: &mut Context, local_idx: u32) {
-    let val = pop_i32(ctx);
+    let val = pop(ctx);
     let val_loc = val.location(&ctx.block_state.locals);
     let dst_loc = ctx.block_state.parent_locals.get(local_idx);
@@ -884,14 +1020,18 @@ fn materialize_local(ctx: &mut Context, local_idx: u32) {
 }
 
 pub fn literal_i32(ctx: &mut Context, imm: i32) {
-    push_i32(ctx, Value::Immediate(imm));
+    push(ctx, Value::Immediate(imm as _));
 }
 
-macro_rules! cmp {
+pub fn literal_i64(ctx: &mut Context, imm: i64) {
+    push(ctx, Value::Immediate(imm));
+}
+
+macro_rules! cmp_i32 {
     ($name:ident, $instr:ident, $const_fallback:expr) => {
         pub fn $name(ctx: &mut Context) {
-            let right = pop_i32(ctx);
-            let left = pop_i32(ctx);
+            let right = pop(ctx);
+            let left = pop(ctx);
 
             let out = if let Some(i) = left.immediate() {
                 match right.location(&ctx.block_state.locals) {
@@ -899,8 +1039,8 @@ macro_rules! cmp {
                         let result = ctx.block_state.regs.take_scratch_gpr();
                         let offset = adjusted_offset(ctx, offset);
                         dynasm!(ctx.asm
-                            ; xor Rq(result), Rq(result)
-                            ; cmp DWORD [rsp + offset], i
+                            ; xor Rd(result), Rd(result)
+                            ; cmp DWORD [rsp + offset], i as i32
                             ; $instr Rb(result)
                         );
                         Value::Temp(result)
@@ -908,8 +1048,78 @@ macro_rules! cmp {
                     ValueLocation::Reg(rreg) => {
                         let result = ctx.block_state.regs.take_scratch_gpr();
                         dynasm!(ctx.asm
-                            ; xor Rq(result), Rq(result)
-                            ; cmp Rd(rreg), i
+                            ; xor Rd(result), Rd(result)
+                            ; cmp Rd(rreg), i as i32
+                            ; $instr Rb(result)
+                        );
+                        Value::Temp(result)
+                    }
+                    ValueLocation::Immediate(right) => {
+                        Value::Immediate(if $const_fallback(i as i32, right as i32) { 1 } else { 0 })
+                    }
+                }
+            } else {
+                let lreg = into_reg(ctx, left);
+                let result = ctx.block_state.regs.take_scratch_gpr();
+
+                match right.location(&ctx.block_state.locals) {
+                    ValueLocation::Stack(offset) => {
+                        let offset = adjusted_offset(ctx, offset);
+                        dynasm!(ctx.asm
+                            ; xor Rd(result), Rd(result)
+                            ; cmp Rd(lreg), [rsp + offset]
+                            ; $instr Rb(result)
+                        );
+                    }
+                    ValueLocation::Reg(rreg) => {
+                        dynasm!(ctx.asm
+                            ; xor Rd(result), Rd(result)
+                            ; cmp Rd(lreg), Rd(rreg)
+                            ; $instr Rb(result)
+                        );
+                    }
+                    ValueLocation::Immediate(i) => {
+                        dynasm!(ctx.asm
+                            ; xor Rd(result), Rd(result)
+                            ; cmp Rd(lreg), i as i32
+                            ; $instr Rb(result)
+                        );
+                    }
+                }
+
+                Value::Temp(result)
+            };
+
+            push(ctx, out);
+            free_value(ctx, left);
+            free_value(ctx, right);
+        }
+    }
+}
+
+macro_rules! cmp_i64 {
+    ($name:ident, $instr:ident, $const_fallback:expr) => {
+        pub fn $name(ctx: &mut Context) {
+            let right = pop(ctx);
+            let left = pop(ctx);
+
+            let out = if let Some(i) = left.immediate() {
+                match right.location(&ctx.block_state.locals) {
+                    ValueLocation::Stack(offset) => {
+                        let result = ctx.block_state.regs.take_scratch_gpr();
+                        let offset = adjusted_offset(ctx, offset);
+                        dynasm!(ctx.asm
+                            ; xor Rd(result), Rd(result)
+                            ; cmp QWORD [rsp + offset], i as i32
+                            ; $instr Rb(result)
+                        );
+                        Value::Temp(result)
+                    }
+                    ValueLocation::Reg(rreg) => {
+                        let result = ctx.block_state.regs.take_scratch_gpr();
+                        dynasm!(ctx.asm
+                            ; xor Rd(result), Rd(result)
+                            ; cmp Rq(rreg), i as i32
                             ; $instr Rb(result)
                         );
                         Value::Temp(result)
@@ -926,53 +1136,69 @@ macro_rules! cmp {
                     ValueLocation::Stack(offset) => {
                         let offset = adjusted_offset(ctx, offset);
                         dynasm!(ctx.asm
-                            ; xor Rq(result), Rq(result)
-                            ; cmp Rd(lreg), [rsp + offset]
+                            ; xor Rd(result), Rd(result)
+                            ; cmp Rq(lreg), [rsp + offset]
                             ; $instr Rb(result)
                         );
                     }
                     ValueLocation::Reg(rreg) => {
                         dynasm!(ctx.asm
-                            ; xor Rq(result), Rq(result)
-                            ; cmp Rd(lreg), Rd(rreg)
+                            ; xor Rd(result), Rd(result)
+                            ; cmp Rq(lreg), Rq(rreg)
                             ; $instr Rb(result)
                         );
                     }
                     ValueLocation::Immediate(i) => {
-                        dynasm!(ctx.asm
-                            ; xor Rq(result), Rq(result)
-                            ; cmp Rd(lreg), i
-                            ; $instr Rb(result)
-                        );
+                        // `cmp r64, imm32` sign-extends the immediate.
+                        if i == i64::from(i as i32) {
+                            dynasm!(ctx.asm
+                                ; xor Rd(result), Rd(result)
+                                ; cmp Rq(lreg), i as i32
+                                ; $instr Rb(result)
+                            );
+                        } else {
+                            unimplemented!("Have yet to implement `cmp` with imm64 operand");
+                        }
                     }
                 }
 
                 Value::Temp(result)
             };
 
-            push_i32(ctx, out);
+            push(ctx, out);
             free_value(ctx, left);
             free_value(ctx, right);
         }
     }
 }
 
-cmp!(i32_eq, sete, |a, b| a == b);
-cmp!(i32_neq, setne, |a, b| a != b);
-// `dynasm-rs` inexplicably doesn't support setb but `setnae` (and `setc`) are synonymous
-cmp!(i32_lt_u, setnae, |a, b| (a as u32) < (b as u32));
-cmp!(i32_le_u, setbe, |a, b| (a as u32) <= (b as u32));
-cmp!(i32_gt_u, seta, |a, b| (a as u32) > (b as u32));
-cmp!(i32_ge_u, setae, |a, b| (a as u32) >= (b as u32));
-cmp!(i32_lt_s, setl, |a, b| a < b);
-cmp!(i32_le_s, setle, |a, b| a <= b);
-cmp!(i32_gt_s, setg, |a, b| a == b);
-cmp!(i32_ge_s, setge, |a, b| a == b);
+cmp_i32!(i32_eq, sete, |a, b| a == b);
+cmp_i32!(i32_neq, setne, |a, b| a != b);
+// `dynasm-rs` inexplicably doesn't support `setb`, but `setnae` (and `setc`) are synonymous
+cmp_i32!(i32_lt_u, setnae, |a, b| (a as u32) < (b as u32));
+cmp_i32!(i32_le_u, setbe, |a, b| (a as u32) <= (b as u32));
+cmp_i32!(i32_gt_u, seta, |a, b| (a as u32) > (b as u32));
+cmp_i32!(i32_ge_u, setae, |a, b| (a as u32) >= (b as u32));
+cmp_i32!(i32_lt_s, setl, |a, b| a < b);
+cmp_i32!(i32_le_s, setle, |a, b| a <= b);
+cmp_i32!(i32_gt_s, setg, |a, b| a > b);
+cmp_i32!(i32_ge_s, setge, |a, b| a >= b);
+
+cmp_i64!(i64_eq, sete, |a, b| a == b);
+cmp_i64!(i64_neq, setne, |a, b| a != b);
+// `dynasm-rs` inexplicably doesn't support `setb`, but `setnae` (and `setc`) are synonymous
+cmp_i64!(i64_lt_u, setnae, |a, b| (a as u64) < (b as u64));
+cmp_i64!(i64_le_u, setbe, |a, b| (a as u64) <= (b as u64));
+cmp_i64!(i64_gt_u, seta, |a, b| (a as u64) > (b as u64));
+cmp_i64!(i64_ge_u, setae, |a, b| (a as u64) >= (b as u64));
+cmp_i64!(i64_lt_s, setl, |a, b| a < b);
+cmp_i64!(i64_le_s, setle, |a, b| a <= b);
+cmp_i64!(i64_gt_s, setg, |a, b| a > b);
+cmp_i64!(i64_ge_s, setge, |a, b| a >= b);
 
 /// Pops an i32 predicate and branches to the specified label
 /// if the predicate is equal to zero.
 pub fn jump_if_false(ctx: &mut Context, label: Label) {
-    let val = pop_i32(ctx);
+    let val = pop(ctx);
     let predicate = into_temp_reg(ctx, val);
     dynasm!(ctx.asm
         ; test Rd(predicate), Rd(predicate)
@@ -988,6 +1214,20 @@ pub fn br(ctx: &mut Context, label: Label) {
     );
 }
 
+fn immediate_to_reg(ctx: &mut Context, reg: GPR, val: i64) {
+    if (val as u64) <= u32::max_value() as u64 {
+        dynasm!(ctx.asm
+            ; mov Rd(reg), val as i32
+        );
+    } else if reg == RAX {
+        dynasm!(ctx.asm
+            ; movabs rax, val
+        );
+    } else {
+        unimplemented!("dynasm doesn't yet support `mov r64, imm64`");
+    }
+}
+
 fn copy_value(ctx: &mut Context, src: ValueLocation, dst: ValueLocation) {
     match (src, dst) {
         (ValueLocation::Stack(in_offset), ValueLocation::Stack(out_offset)) => {
@@ -1011,8 +1251,14 @@ fn copy_value(ctx: &mut Context, src: ValueLocation, dst: ValueLocation) {
         (ValueLocation::Immediate(i), ValueLocation::Stack(out_offset)) => {
             let out_offset = adjusted_offset(ctx, out_offset);
             dynasm!(ctx.asm
-                ; mov DWORD [rsp + out_offset], i
+                ; mov DWORD [rsp + out_offset], i as i32
             );
+            if (i as u64) > u32::max_value() as u64 {
+                let i = (i >> 32) as i32;
+                dynasm!(ctx.asm
+                    ; mov DWORD [rsp + out_offset + 4], i
+                );
+            }
         }
         (ValueLocation::Stack(in_offset), ValueLocation::Reg(out_reg)) => {
             let in_offset = adjusted_offset(ctx, in_offset);
@@ -1028,9 +1274,7 @@ fn copy_value(ctx: &mut Context, src: ValueLocation, dst: ValueLocation) {
         }
         (ValueLocation::Immediate(i), ValueLocation::Reg(out_reg)) => {
-            dynasm!(ctx.asm
-                ; mov Rq(out_reg), i
-            );
+            immediate_to_reg(ctx, out_reg, i);
         }
         // TODO: Have separate `ReadLocation` and `WriteLocation`?
         (_, ValueLocation::Immediate(_)) => panic!("Tried to copy to an immediate value!"),
@@ -1121,7 +1365,7 @@ fn free_register(ctx: &mut Context, reg: GPR) {
             // don't have to check this at all (i.e. order on the
            // physical stack and order on the logical stack should
            // be independent).
-            assert_eq!(to_repush, 0);
+            debug_assert_eq!(to_repush, 0);
             dynasm!(ctx.asm
                 ; push Rq(reg)
             );
@@ -1180,7 +1424,7 @@ fn pass_outgoing_args(ctx: &mut Context, arity: u32, return_arity: u32) -> CallCleanup {
             // `AbsoluteValueLocation` and `RelativeValueLocation`.
             let offset = stack_slot * WORD_SIZE as i32
                 - ctx.block_state.depth.0 as i32 * WORD_SIZE as i32;
-            pop_i32_into(ctx, ValueLocation::Stack(offset));
+            pop_into(ctx, ValueLocation::Stack(offset));
         }
     }
 
@@ -1188,7 +1432,7 @@ fn pass_outgoing_args(ctx: &mut Context, arity: u32, return_arity: u32) -> CallCleanup {
         .iter()
         .rev()
     {
-        pop_i32_into(ctx, ValueLocation::Reg(*reg));
+        pop_into(ctx, ValueLocation::Reg(*reg));
     }
 
     // We do this before doing `save_volatile`, since otherwise we'll trample the return value
@@ -1220,9 +1464,18 @@ fn post_call_cleanup(ctx: &mut Context, mut cleanup: CallCleanup) {
     }
 }
 
+fn push_function_return(ctx: &mut Context, arity: u32) {
+    if arity == 0 {
+        return;
+    }
+    debug_assert_eq!(arity, 1);
+    ctx.block_state.regs.mark_used(RAX);
+    ctx.block_state.stack.push(StackValue::Temp(RAX));
+}
+
 /// Call a function with the given index
 pub fn call_direct(ctx: &mut Context, index: u32, arg_arity: u32, return_arity: u32) {
-    assert!(
+    debug_assert!(
         return_arity == 0 || return_arity == 1,
         "We don't support multiple return yet"
     );
@@ -1235,7 +1488,7 @@ pub fn call_direct(ctx: &mut Context, index: u32, arg_arity: u32, return_arity:
     );
     post_call_cleanup(ctx, cleanup);
 
-    push_return_value(ctx, return_arity);
+    push_function_return(ctx, return_arity);
 }
 
 #[must_use]
diff --git a/src/function_body.rs b/src/function_body.rs
index b3a98aedda..92f6dbee72 100644
--- a/src/function_body.rs
+++ b/src/function_body.rs
@@ -134,8 +134,7 @@ pub fn translate(
     ));
 
     // TODO: We want to make this a state machine (maybe requires 1-element lookahead? Not sure) so that we
-    // can coelesce multiple `end`s and optimise break-at-end-of-block into noop. We can't do one
-    // without the other, since the main case we want to optimise is `(block (loop (br 1)))`.
+    // can coalesce multiple `end`s and optimise break-at-end-of-block into a noop.
 
     for op in operators {
         let op = op?;
@@ -311,6 +310,22 @@ pub fn translate(
             Operator::I32Or => i32_or(ctx),
             Operator::I32Xor => i32_xor(ctx),
             Operator::I32Mul => i32_mul(ctx),
+            Operator::I64Eq => i64_eq(ctx),
+            Operator::I64Ne => i64_neq(ctx),
+            Operator::I64LtS => i64_lt_s(ctx),
+            Operator::I64LeS => i64_le_s(ctx),
+            Operator::I64GtS => i64_gt_s(ctx),
+            Operator::I64GeS => i64_ge_s(ctx),
+            Operator::I64LtU => i64_lt_u(ctx),
+            Operator::I64LeU => i64_le_u(ctx),
+            Operator::I64GtU => i64_gt_u(ctx),
+            Operator::I64GeU => i64_ge_u(ctx),
+            Operator::I64Add => i64_add(ctx),
+            Operator::I64Sub => i64_sub(ctx),
+            Operator::I64And => i64_and(ctx),
+            Operator::I64Or => i64_or(ctx),
+            Operator::I64Xor => i64_xor(ctx),
+            Operator::I64Mul => i64_mul(ctx),
             Operator::Drop => drop(ctx),
             Operator::SetLocal { local_index } => set_local_i32(ctx, local_index),
             Operator::GetLocal { local_index } => get_local_i32(ctx, local_index),
diff --git a/src/tests.rs b/src/tests.rs
index 9045db20a3..6d7d515e76 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -18,40 +18,88 @@ fn empty() {
     let _ = translate_wat("(module (func))");
 }
 
-macro_rules! binop_test {
-    ($op:ident, $func:expr) => {
-        quickcheck! {
-            fn $op(a: u32, b: u32) -> bool {
-                static CODE: &str = concat!(
-                    "(module (func (param i32) (param i32) (result i32) (i32.",
-                    stringify!($op),
-                    " (get_local 0) (get_local 1))))"
-                );
+mod op32 {
+    use super::{translate_wat, TranslatedModule};
 
-                lazy_static! {
-                    static ref TRANSLATED: TranslatedModule = translate_wat(CODE);
+    macro_rules! binop_test {
+        ($op:ident, $func:expr) => {
+            quickcheck! {
+                fn $op(a: u32, b: u32) -> bool {
+                    static CODE: &str = concat!(
+                        "(module (func (param i32) (param i32) (result i32) (i32.",
+                        stringify!($op),
+                        " (get_local 0) (get_local 1))))"
+                    );
+
+                    lazy_static! {
+                        static ref TRANSLATED: TranslatedModule = translate_wat(CODE);
+                    }
+
+                    unsafe { TRANSLATED.execute_func::<(u32, u32), u32>(0, (a, b)) == $func(a, b) }
                 }
-
-                unsafe { TRANSLATED.execute_func::<(u32, u32), u32>(0, (a, b)) == $func(a, b) }
             }
-        }
-    };
+        };
+    }
+
+    binop_test!(add, u32::wrapping_add);
+    binop_test!(sub, u32::wrapping_sub);
+    binop_test!(and, std::ops::BitAnd::bitand);
+    binop_test!(or, std::ops::BitOr::bitor);
+    binop_test!(xor, std::ops::BitXor::bitxor);
+    binop_test!(mul, u32::wrapping_mul);
+    binop_test!(lt_u, |a, b| if a < b { 1 } else { 0 });
+    binop_test!(le_u, |a, b| if a <= b { 1 } else { 0 });
+    binop_test!(gt_u, |a, b| if a > b { 1 } else { 0 });
+    binop_test!(ge_u, |a, b| if a >= b { 1 } else { 0 });
+    binop_test!(lt_s, |a, b| if (a as i32) < (b as i32) { 1 } else { 0 });
+    binop_test!(le_s, |a, b| if (a as i32) <= (b as i32) { 1 } else { 0 });
+    binop_test!(gt_s, |a, b| if (a as i32) > (b as i32) { 1 } else { 0 });
+    binop_test!(ge_s, |a, b| if (a as i32) >= (b as i32) { 1 } else { 0 });
+}
 
-binop_test!(add, u32::wrapping_add);
-binop_test!(sub, u32::wrapping_sub);
-binop_test!(and, std::ops::BitAnd::bitand);
-binop_test!(or, std::ops::BitOr::bitor);
-binop_test!(xor, std::ops::BitXor::bitxor);
-binop_test!(mul, u32::wrapping_mul);
-binop_test!(lt_u, |a, b| if a < b { 1 } else { 0 });
-binop_test!(le_u, |a, b| if a <= b { 1 } else { 0 });
-binop_test!(gt_u, |a, b| if a > b { 1 } else { 0 });
-binop_test!(ge_u, |a, b| if a >= b { 1 } else { 0 });
-binop_test!(lt_s, |a, b| if (a as i32) < (b as i32) { 1 } else { 0 });
-binop_test!(le_s, |a, b| if (a as i32) <= (b as i32) { 1 } else { 0 });
-binop_test!(gt_s, |a, b| if (a as i32) > (b as i32) { 1 } else { 0 });
-binop_test!(ge_s, |a, b| if (a as i32) >= (b as i32) { 1 } else { 0 });
+mod op64 {
+    use super::{translate_wat, TranslatedModule};
+
+    macro_rules! binop_test {
+        ($op:ident, $func:expr) => {
+            binop_test!($op, $func, i64);
+        };
+        ($op:ident, $func:expr, $retty:ident) => {
+            quickcheck! {
+                fn $op(a: u64, b: u64) -> bool {
+                    static CODE: &str = concat!(
+                        "(module (func (param i64) (param i64) (result ",
+                        stringify!($retty),
+                        ") (i64.",
+                        stringify!($op),
+                        " (get_local 0) (get_local 1))))"
+                    );
+
+                    lazy_static! {
+                        static ref TRANSLATED: TranslatedModule = translate_wat(CODE);
+                    }
+
+                    unsafe { TRANSLATED.execute_func::<(u64, u64), u64>(0, (a, b)) == $func(a, b) }
+                }
+            }
+        };
+    }
+
+    binop_test!(add, u64::wrapping_add);
+    binop_test!(sub, u64::wrapping_sub);
+    binop_test!(and, std::ops::BitAnd::bitand);
+    binop_test!(or, std::ops::BitOr::bitor);
+    binop_test!(xor, std::ops::BitXor::bitxor);
+    binop_test!(mul, u64::wrapping_mul);
+    binop_test!(lt_u, |a, b| if a < b { 1 } else { 0 }, i32);
+    binop_test!(le_u, |a, b| if a <= b { 1 } else { 0 }, i32);
+    binop_test!(gt_u, |a, b| if a > b { 1 } else { 0 }, i32);
+    binop_test!(ge_u, |a, b| if a >= b { 1 } else { 0 }, i32);
+    binop_test!(lt_s, |a, b| if (a as i64) < (b as i64) { 1 } else { 0 }, i32);
+    binop_test!(le_s, |a, b| if (a as i64) <= (b as i64) { 1 } else { 0 }, i32);
+    binop_test!(gt_s, |a, b| if (a as i64) > (b as i64) { 1 } else { 0 }, i32);
+    binop_test!(ge_s, |a, b| if (a as i64) >= (b as i64) { 1 } else { 0 }, i32);
+}
 
 quickcheck! {
     fn relop_eq(a: u32, b: u32) -> bool {
@@ -571,16 +619,29 @@ fn fib() {
 }
 
 #[bench]
-fn bench_compile(b: &mut test::Bencher) {
+fn bench_fibonacci_compile(b: &mut test::Bencher) {
     let wasm = wabt::wat2wasm(FIBONACCI).unwrap();
 
     b.iter(|| test::black_box(translate(&wasm).unwrap()));
 }
 
 #[bench]
-fn bench_run(b: &mut test::Bencher) {
+fn bench_fibonacci_run(b: &mut test::Bencher) {
     let wasm = wabt::wat2wasm(FIBONACCI).unwrap();
     let module = translate(&wasm).unwrap();
 
     b.iter(|| unsafe { module.execute_func::<_, u32>(0, (20,)) });
 }
+
+#[bench]
+fn bench_fibonacci_baseline(b: &mut test::Bencher) {
+    fn fib(n: i32) -> i32 {
+        if n == 0 || n == 1 {
+            1
+        } else {
+            fib(n - 1) + fib(n - 2)
+        }
+    }
+
+    b.iter(|| test::black_box(fib(test::black_box(20))));
+}
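
---

Reviewer notes (not part of the patch):

1. The immediate-width guards above rely on two different x86-64 extension rules. `mov r32, imm32` zero-extends into the full 64-bit register, so `immediate_to_reg`'s `(val as u64) <= u32::max_value() as u64` check is correct there. The 64-bit ALU forms (`add`/`sub`/`imul`/`cmp`/... with an `Rq` destination) instead sign-extend their 32-bit immediate, which is why the binop paths check that the value round-trips through `i32`. Note that the immediate-left arms of `cmp_i64!` still emit `i as i32` with no guard at all, so a large 64-bit constant on the left-hand side would be silently truncated there. A minimal standalone sketch of the two predicates (the helper names are mine, not from the codebase):

    /// `mov r32, imm32` zero-extends into the full 64-bit register, so any
    /// value whose high 32 bits are zero can be materialized that way.
    fn fits_zero_extended(val: i64) -> bool {
        val as u64 <= u64::from(u32::max_value())
    }

    /// 64-bit ALU instructions (`add`/`cmp`/... with an `Rq` destination)
    /// sign-extend their 32-bit immediate, so the value must round-trip
    /// through `i32` unchanged.
    fn fits_sign_extended(val: i64) -> bool {
        val == i64::from(val as i32)
    }

    fn main() {
        // 0xFFFF_FFFF fits zero-extended (`mov`) but NOT sign-extended:
        // as an `add r64, imm32` operand it would become -1.
        assert!(fits_zero_extended(0xFFFF_FFFF));
        assert!(!fits_sign_extended(0xFFFF_FFFF));

        // -1 fits sign-extended but not zero-extended.
        assert!(fits_sign_extended(-1));
        assert!(!fits_zero_extended(-1));
    }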
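
2. `immediate_to_reg` can only materialize a full 64-bit immediate into RAX via `movabs`, because dynasm-rs lacked `mov r64, imm64` for arbitrary registers at the time. The encoding is small enough to emit by hand if that limitation becomes a problem: a REX.W prefix (with REX.B set for R8-R15), opcode B8+rd, then the immediate as eight little-endian bytes. A standalone sketch of that encoding, not wired into the assembler:

    /// Encode `mov r64, imm64` (REX.W + B8+rd io) by hand. Standalone sketch;
    /// `reg` is the numeric GPR index 0-15 (RAX = 0, R8 = 8, ...).
    fn encode_mov_r64_imm64(reg: u8, imm: i64) -> Vec<u8> {
        assert!(reg < 16, "not a GPR index");
        let mut out = Vec::with_capacity(10);
        // REX.W = 1 selects 64-bit operand size; REX.B extends the register
        // field for R8-R15.
        out.push(0x48 | ((reg & 8) >> 3));
        // Opcode B8+rd takes the low three bits of the register number.
        out.push(0xB8 + (reg & 7));
        // The immediate follows as eight little-endian bytes.
        out.extend_from_slice(&imm.to_le_bytes());
        out
    }

    fn main() {
        // `movabs rax, 0x1122334455667788` == 48 B8 88 77 66 55 44 33 22 11
        assert_eq!(
            encode_mov_r64_imm64(0, 0x1122334455667788),
            [0x48, 0xB8, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11]
        );
    }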
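
3. The new quickcheck tests route both operands through `get_local`, so the `ValueLocation::Immediate` arms of the i64 binops are never exercised. A hypothetical extra test in the style of src/tests.rs could cover the sign-extended-immediate path; `translate_wat`, `TranslatedModule`, and `execute_func` exist in that file, but the test itself and the assumed `execute_func::<(u64,), u64>` signature are mine. A constant outside the sign-extended imm32 range (e.g. `0x1_0000_0000`) would currently abort translation in the `unimplemented!` branch, which deserves a test of its own once that path is implemented:

    quickcheck! {
        // Hypothetical test: drives an i64.const through the Immediate arm
        // of `commutative_binop_i64!` rather than through `get_local`.
        fn i64_add_const(a: u64) -> bool {
            static CODE: &str =
                "(module (func (param i64) (result i64) (i64.add (get_local 0) (i64.const -1))))";

            lazy_static! {
                static ref TRANSLATED: TranslatedModule = translate_wat(CODE);
            }

            unsafe { TRANSLATED.execute_func::<(u64,), u64>(0, (a,)) == a.wrapping_sub(1) }
        }
    }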