diff --git a/build.rs b/build.rs index 98a31a2c10..74e3e2ad02 100644 --- a/build.rs +++ b/build.rs @@ -171,7 +171,6 @@ fn write_testsuite_tests( /// Ignore tests that aren't supported yet. fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { - let target = env::var("TARGET").unwrap(); match strategy { #[cfg(feature = "lightbeam")] "Lightbeam" => match (testsuite, testname) { @@ -209,16 +208,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { return true; } - // FIXME(#1569) stack protection isn't implemented yet and these - // tests segfault. - ("spec_testsuite", "skip_stack_guard_page") - | ("spec_testsuite", "stack") - | ("misc_testsuite", "stack_overflow") - if target.contains("aarch64") => - { - return true - } - _ => {} }, _ => panic!("unrecognized strategy"), diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 860089d23d..8fb085725f 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -111,6 +111,7 @@ fn compute_arg_locs(call_conv: isa::CallConv, params: &[ir::AbiParam]) -> (Vec {} _ => panic!( "Unsupported argument purpose {:?} in signature: {:?}", @@ -192,6 +193,20 @@ pub struct AArch64ABIBody { call_conv: isa::CallConv, /// The settings controlling this function's compilation. flags: settings::Flags, + /// Whether or not this function is a "leaf", meaning it calls no other + /// functions + is_leaf: bool, + /// If this function has a stack limit specified, then `Reg` is where the + /// stack limit will be located after the instructions specified have been + /// executed. + /// + /// Note that this is intended for insertion into the prologue, if + /// present. Also note that because the instructions here execute in the + /// prologue this happens after legalization/register allocation/etc so we + /// need to be extremely careful with each instruction. The instructions are + /// manually register-allocated and carefully only use caller-saved + /// registers and keep nothing live after this sequence of instructions. + stack_limit: Option<(Reg, Vec)>, } fn in_int_reg(ty: ir::Type) -> bool { @@ -209,6 +224,85 @@ fn in_vec_reg(ty: ir::Type) -> bool { } } +/// Generates the instructions necessary for the `gv` to be materialized into a +/// register. +/// +/// This function will return a register that will contain the result of +/// evaluating `gv`. It will also return any instructions necessary to calculate +/// the value of the register. +/// +/// Note that global values are typically lowered to instructions via the +/// standard legalization pass. Unfortunately though prologue generation happens +/// so late in the pipeline that we can't use these legalization passes to +/// generate the instructions for `gv`. As a result we duplicate some lowering +/// of `gv` here and support only some global values. This is similar to what +/// the x86 backend does for now, and hopefully this can be somewhat cleaned up +/// in the future too! +/// +/// Also note that this function will make use of `writable_spilltmp_reg()` as a +/// temporary register to store values in if necessary. Currently after we write +/// to this register there's guaranteed to be no spilled values between where +/// it's used, because we're not participating in register allocation anyway! +fn gen_stack_limit(f: &ir::Function, abi: &ABISig, gv: ir::GlobalValue) -> (Reg, Vec) { + let mut insts = Vec::new(); + let reg = generate_gv(f, abi, gv, &mut insts); + return (reg, insts); + + fn generate_gv( + f: &ir::Function, + abi: &ABISig, + gv: ir::GlobalValue, + insts: &mut Vec, + ) -> Reg { + match f.global_values[gv] { + // Return the direct register the vmcontext is in + ir::GlobalValueData::VMContext => { + get_special_purpose_param_register(f, abi, ir::ArgumentPurpose::VMContext) + .expect("no vmcontext parameter found") + } + // Load our base value into a register, then load from that register + // in to a temporary register. + ir::GlobalValueData::Load { + base, + offset, + global_type: _, + readonly: _, + } => { + let base = generate_gv(f, abi, base, insts); + let into_reg = writable_spilltmp_reg(); + let mem = if let Some(offset) = + UImm12Scaled::maybe_from_i64(offset.into(), ir::types::I8) + { + MemArg::UnsignedOffset(base, offset) + } else { + let offset: i64 = offset.into(); + insts.extend(Inst::load_constant(into_reg, offset as u64)); + MemArg::RegReg(base, into_reg.to_reg()) + }; + insts.push(Inst::ULoad64 { + rd: into_reg, + mem, + srcloc: None, + }); + return into_reg.to_reg(); + } + ref other => panic!("global value for stack limit not supported: {}", other), + } + } +} + +fn get_special_purpose_param_register( + f: &ir::Function, + abi: &ABISig, + purpose: ir::ArgumentPurpose, +) -> Option { + let idx = f.signature.special_param_index(purpose)?; + match abi.args[idx] { + ABIArg::Reg(reg, _) => Some(reg.to_reg()), + ABIArg::Stack(..) => None, + } +} + impl AArch64ABIBody { /// Create a new body ABI instance. pub fn new(f: &ir::Function, flags: settings::Flags) -> Self { @@ -238,6 +332,15 @@ impl AArch64ABIBody { stackslots.push(off); } + // Figure out what instructions, if any, will be needed to check the + // stack limit. This can either be specified as a special-purpose + // argument or as a global value which often calculates the stack limit + // from the arguments. + let stack_limit = + get_special_purpose_param_register(f, &sig, ir::ArgumentPurpose::StackLimit) + .map(|reg| (reg, Vec::new())) + .or_else(|| f.stack_limit.map(|gv| gen_stack_limit(f, &sig, gv))); + Self { sig, stackslots, @@ -247,6 +350,8 @@ impl AArch64ABIBody { frame_size: None, call_conv, flags, + is_leaf: f.is_leaf(), + stack_limit, } } @@ -262,6 +367,97 @@ impl AArch64ABIBody { 16 // frame pointer + return address. } } + + /// Inserts instructions necessary for checking the stack limit into the + /// prologue. + /// + /// This function will generate instructions necessary for perform a stack + /// check at the header of a function. The stack check is intended to trap + /// if the stack pointer goes below a particular threshold, preventing stack + /// overflow in wasm or other code. The `stack_limit` argument here is the + /// register which holds the threshold below which we're supposed to trap. + /// This function is known to allocate `stack_size` bytes and we'll push + /// instructions onto `insts`. + /// + /// Note that the instructions generated here are special because this is + /// happening so late in the pipeline (e.g. after register allocation). This + /// means that we need to do manual register allocation here and also be + /// careful to not clobber any callee-saved or argument registers. For now + /// this routine makes do with the `writable_spilltmp_reg` as one temporary + /// register, and a second register of `x16` which is caller-saved. This + /// should be fine for us since no spills should happen in this sequence of + /// instructions, so our register won't get accidentally clobbered. + /// + /// No values can be live after the prologue, but in this case that's ok + /// because we just need to perform a stack check before progressing with + /// the rest of the function. + fn insert_stack_check(&self, stack_limit: Reg, stack_size: u32, insts: &mut Vec) { + // With no explicit stack allocated we can just emit the simple check of + // the stack registers against the stack limit register, and trap if + // it's out of bounds. + if stack_size == 0 { + return push_check(stack_limit, insts); + } + + // Note that the 32k stack size here is pretty special. See the + // documentation in x86/abi.rs for why this is here. The general idea is + // that we're protecting against overflow in the addition that happens + // below. + if stack_size >= 32 * 1024 { + push_check(stack_limit, insts); + } + + // Add the `stack_size` to `stack_limit`, placing the result in + // `scratch`. + // + // Note though that `stack_limit`'s register may be the same as + // `scratch`. If our stack size doesn't fit into an immediate this + // means we need a second scratch register for loading the stack size + // into a register. We use `x16` here since it's caller-saved and we're + // in the function prologue and nothing else is allocated to it yet. + let scratch = writable_spilltmp_reg(); + let stack_size = u64::from(stack_size); + if let Some(imm12) = Imm12::maybe_from_u64(stack_size) { + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: scratch, + rn: stack_limit, + imm12, + }); + } else { + let scratch2 = 16; + insts.extend(Inst::load_constant( + Writable::from_reg(xreg(scratch2)), + stack_size.into(), + )); + insts.push(Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd: scratch, + rn: stack_limit, + rm: xreg(scratch2), + extendop: ExtendOp::UXTX, + }); + } + push_check(scratch.to_reg(), insts); + + fn push_check(stack_limit: Reg, insts: &mut Vec) { + insts.push(Inst::AluRRR { + alu_op: ALUOp::SubS64XR, + rd: writable_zero_reg(), + rn: stack_reg(), + rm: stack_limit, + }); + insts.push(Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(8), + // Here `Hs` == "higher or same" when interpreting the two + // operands as unsigned integers. + kind: CondBrKind::Cond(Cond::Hs), + }); + insts.push(Inst::Udf { + trap_info: (ir::SourceLoc::default(), ir::TrapCode::StackOverflow), + }); + } + } } fn load_stack_from_fp(fp_offset: i64, into_reg: Writable, ty: Type) -> Inst { @@ -682,31 +878,41 @@ impl ABIBody for AArch64ABIBody { } let total_stacksize = (total_stacksize + 15) & !15; // 16-align the stack. - if !self.call_conv.extends_baldrdash() && total_stacksize > 0 { - // sub sp, sp, #total_stacksize - if let Some(imm12) = Imm12::maybe_from_u64(total_stacksize as u64) { - let sub_inst = Inst::AluRRImm12 { - alu_op: ALUOp::Sub64, - rd: writable_stack_reg(), - rn: stack_reg(), - imm12, - }; - insts.push(sub_inst); - } else { - let tmp = writable_spilltmp_reg(); - let const_inst = Inst::LoadConst64 { - rd: tmp, - const_data: total_stacksize as u64, - }; - let sub_inst = Inst::AluRRRExtend { - alu_op: ALUOp::Sub64, - rd: writable_stack_reg(), - rn: stack_reg(), - rm: tmp.to_reg(), - extendop: ExtendOp::UXTX, - }; - insts.push(const_inst); - insts.push(sub_inst); + if !self.call_conv.extends_baldrdash() { + // Leaf functions with zero stack don't need a stack check if one's + // specified, otherwise always insert the stack check. + if total_stacksize > 0 || !self.is_leaf { + if let Some((reg, stack_limit_load)) = &self.stack_limit { + insts.extend_from_slice(stack_limit_load); + self.insert_stack_check(*reg, total_stacksize, &mut insts); + } + } + if total_stacksize > 0 { + // sub sp, sp, #total_stacksize + if let Some(imm12) = Imm12::maybe_from_u64(total_stacksize as u64) { + let sub_inst = Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12, + }; + insts.push(sub_inst); + } else { + let tmp = writable_spilltmp_reg(); + let const_inst = Inst::LoadConst64 { + rd: tmp, + const_data: total_stacksize as u64, + }; + let sub_inst = Inst::AluRRRExtend { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + rm: tmp.to_reg(), + extendop: ExtendOp::UXTX, + }; + insts.push(const_inst); + insts.push(sub_inst); + } } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index d012521ecd..e3a9043147 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -347,6 +347,7 @@ impl MachInstEmit for Inst { ALUOp::AddS64 => 0b10101011_000, ALUOp::SubS32 => 0b01101011_000, ALUOp::SubS64 => 0b11101011_000, + ALUOp::SubS64XR => 0b11101011_001, ALUOp::SDiv64 => 0b10011010_110, ALUOp::UDiv64 => 0b10011010_110, ALUOp::RotR32 | ALUOp::Lsr32 | ALUOp::Asr32 | ALUOp::Lsl32 => 0b00011010_110, @@ -369,12 +370,16 @@ impl MachInstEmit for Inst { ALUOp::Lsr32 | ALUOp::Lsr64 => 0b001001, ALUOp::Asr32 | ALUOp::Asr64 => 0b001010, ALUOp::Lsl32 | ALUOp::Lsl64 => 0b001000, + ALUOp::SubS64XR => 0b011000, _ => 0b000000, }; debug_assert_ne!(writable_stack_reg(), rd); - // The stack pointer is the zero register in this context, so this might be an - // indication that something is wrong. - debug_assert_ne!(stack_reg(), rn); + // The stack pointer is the zero register if this instruction + // doesn't have access to extended registers, so this might be + // an indication that something is wrong. + if alu_op != ALUOp::SubS64XR { + debug_assert_ne!(stack_reg(), rn); + } debug_assert_ne!(stack_reg(), rm); sink.put4(enc_arith_rrr(top11, bit15_10, rd, rn, rm)); } @@ -2149,6 +2154,17 @@ mod test { "subs x10, x11, x12, LSL 23", )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS64XR, + rd: writable_zero_reg(), + rn: stack_reg(), + rm: xreg(12), + }, + "FF632CEB", + "subs xzr, sp, x12", + )); + insns.push(( Inst::AluRRRR { alu_op: ALUOp::MAdd32, diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 52330e66e6..98776a490c 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -66,6 +66,8 @@ pub enum ALUOp { SubS32, /// Sub, setting flags SubS64, + /// Sub, setting flags, using extended registers + SubS64XR, /// Multiply-add MAdd32, /// Multiply-add @@ -1931,6 +1933,7 @@ impl ShowWithRRU for Inst { ALUOp::AddS64 => ("adds", InstSize::Size64), ALUOp::SubS32 => ("subs", InstSize::Size32), ALUOp::SubS64 => ("subs", InstSize::Size64), + ALUOp::SubS64XR => ("subs", InstSize::Size64), ALUOp::MAdd32 => ("madd", InstSize::Size32), ALUOp::MAdd64 => ("madd", InstSize::Size64), ALUOp::MSub32 => ("msub", InstSize::Size32), diff --git a/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif b/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif new file mode 100644 index 0000000000..4b39a81b6f --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/stack-limit.clif @@ -0,0 +1,190 @@ +test vcode +target aarch64 + +function %foo() { +block0: + return +} + +function %stack_limit_leaf_zero(i64 stack_limit) { +block0(v0: i64): + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %stack_limit_gv_leaf_zero(i64 vmctx) { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + gv2 = load.i64 notrap aligned gv1+4 + stack_limit = gv2 +block0(v0: i64): + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %stack_limit_call_zero(i64 stack_limit) { + fn0 = %foo() +block0(v0: i64): + call fn0() + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: subs xzr, sp, x0 +; nextln: b.hs 8 +; nextln: udf +; nextln: bl 0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %stack_limit_gv_call_zero(i64 vmctx) { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + gv2 = load.i64 notrap aligned gv1+4 + stack_limit = gv2 + fn0 = %foo() +block0(v0: i64): + call fn0() + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldr x15, [x0] +; nextln: ldr x15, [x15, #4] +; nextln: subs xzr, sp, x15 +; nextln: b.hs 8 +; nextln: udf +; nextln: bl 0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %stack_limit(i64 stack_limit) { + ss0 = explicit_slot 168 +block0(v0: i64): + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add x15, x0, #176 +; nextln: subs xzr, sp, x15 +; nextln: b.hs 8 +; nextln: udf +; nextln: sub sp, sp, #176 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %huge_stack_limit(i64 stack_limit) { + ss0 = explicit_slot 400000 +block0(v0: i64): + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: subs xzr, sp, x0 +; nextln: b.hs 8 +; nextln: udf +; nextln: movz x16, #6784 +; nextln: movk x16, #6, LSL #16 +; nextln: add x15, x0, x16, UXTX +; nextln: subs xzr, sp, x15 +; nextln: b.hs 8 +; nextln: udf +; nextln: ldr x15, 8 ; b 12 ; data 400000 +; nextln: sub sp, sp, x15, UXTX +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %limit_preamble(i64 vmctx) { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + gv2 = load.i64 notrap aligned gv1+4 + stack_limit = gv2 + ss0 = explicit_slot 20 +block0(v0: i64): + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldr x15, [x0] +; nextln: ldr x15, [x15, #4] +; nextln: add x15, x15, #32 +; nextln: subs xzr, sp, x15 +; nextln: b.hs 8 +; nextln: udf +; nextln: sub sp, sp, #32 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %limit_preamble_huge(i64 vmctx) { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0 + gv2 = load.i64 notrap aligned gv1+4 + stack_limit = gv2 + ss0 = explicit_slot 400000 +block0(v0: i64): + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldr x15, [x0] +; nextln: ldr x15, [x15, #4] +; nextln: subs xzr, sp, x15 +; nextln: b.hs 8 +; nextln: udf +; nextln: movz x16, #6784 +; nextln: movk x16, #6, LSL #16 +; nextln: add x15, x15, x16, UXTX +; nextln: subs xzr, sp, x15 +; nextln: b.hs 8 +; nextln: udf +; nextln: ldr x15, 8 ; b 12 ; data 400000 +; nextln: sub sp, sp, x15, UXTX +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %limit_preamble_huge_offset(i64 vmctx) { + gv0 = vmctx + gv1 = load.i64 notrap aligned gv0+400000 + stack_limit = gv1 + ss0 = explicit_slot 20 +block0(v0: i64): + return +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x15, #6784 +; nextln: movk x15, #6, LSL #16 +; nextln: ldr x15, [x0, x15] +; nextln: add x15, x15, #32 +; nextln: subs xzr, sp, x15 +; nextln: b.hs 8 +; nextln: udf +; nextln: sub sp, sp, #32 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/crates/api/src/frame_info.rs b/crates/api/src/frame_info.rs index 0a4b2b6cf6..7894e36945 100644 --- a/crates/api/src/frame_info.rs +++ b/crates/api/src/frame_info.rs @@ -94,7 +94,11 @@ impl GlobalFrameInfo { // the function, because otherwise something is buggy along the way and // not accounting for all the instructions. This isn't super critical // though so we can omit this check in release mode. - debug_assert!(pos.is_some(), "failed to find instruction for {:x}", pc); + // + // FIXME(#1521) aarch64 instruction info isn't quite up-to-par yet. + if !cfg!(target_arch = "aarch64") { + debug_assert!(pos.is_some(), "failed to find instruction for {:x}", pc); + } let instr = match pos { Some(pos) => func.instr_map.instructions[pos].srcloc, diff --git a/foo.c b/foo.c new file mode 100644 index 0000000000..15b69f9df3 --- /dev/null +++ b/foo.c @@ -0,0 +1,10 @@ +void bar(); +void baz(); +int foo(int b, int a) { + if (b) { + baz(); + return a + 3; + } + bar(); + return a + 4; +} diff --git a/foo.o b/foo.o new file mode 100644 index 0000000000..33569782cc Binary files /dev/null and b/foo.o differ diff --git a/foo.s b/foo.s new file mode 100644 index 0000000000..609fc5e9c6 --- /dev/null +++ b/foo.s @@ -0,0 +1,32 @@ + .arch armv8-a + .file "foo.c" + .text + .align 2 + .global foo + .type foo, %function +foo: +.LFB0: + .cfi_startproc + stp x29, x30, [sp, -32]! + .cfi_def_cfa_offset 32 + .cfi_offset 29, -32 + .cfi_offset 30, -24 + add x29, sp, 0 + .cfi_def_cfa_register 29 + str x19, [sp, 16] + .cfi_offset 19, -16 + mov w19, w0 + bl bar + add w0, w19, 3 + ldr x19, [sp, 16] + ldp x29, x30, [sp], 32 + .cfi_restore 30 + .cfi_restore 29 + .cfi_restore 19 + .cfi_def_cfa 31, 0 + ret + .cfi_endproc +.LFE0: + .size foo, .-foo + .ident "GCC: (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) 7.5.0" + .section .note.GNU-stack,"",@progbits diff --git a/tests/all/iloop.rs b/tests/all/iloop.rs index 6ea292ce19..9e0f5aa851 100644 --- a/tests/all/iloop.rs +++ b/tests/all/iloop.rs @@ -99,7 +99,6 @@ fn loop_interrupt_from_afar() -> anyhow::Result<()> { } #[test] -#[cfg_attr(target_arch = "aarch64", ignore)] // FIXME(#1569) fn function_interrupt_from_afar() -> anyhow::Result<()> { // Create an instance which calls an imported function on each iteration of // the loop so we can count the number of loop iterations we've executed so diff --git a/tests/all/stack_overflow.rs b/tests/all/stack_overflow.rs index c03b8430b8..283426130e 100644 --- a/tests/all/stack_overflow.rs +++ b/tests/all/stack_overflow.rs @@ -2,7 +2,6 @@ use std::sync::atomic::{AtomicUsize, Ordering::SeqCst}; use wasmtime::*; #[test] -#[cfg_attr(target_arch = "aarch64", ignore)] // FIXME(#1569) fn host_always_has_some_stack() -> anyhow::Result<()> { static HITS: AtomicUsize = AtomicUsize::new(0); // assume hosts always have at least 512k of stack