diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 1d1068a887..0984575981 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -1425,9 +1425,54 @@ pub(crate) fn lower_icmp<C: LowerCtx<I = Inst>>(
             }
         }
     } else if !ty.is_vector() {
-        let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
         let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
         let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
+
+        let is_overflow = condcode == IntCC::Overflow || condcode == IntCC::NotOverflow;
+        let is_small_type = ty == I8 || ty == I16;
+        let (rn, rm) = if is_overflow && is_small_type {
+            // Overflow checks for non-native types require additional instructions
+            // beyond just the extend op.
+            //
+            // TODO: Codegen improvements here:
+            // * Merge the second sxt{h,b} into the sub instruction.
+            // * We can especially improve codegen here if we can return a different flag out of
+            //   this function. That way we can tell the caller to use the 'ne' flag and save
+            //   the last 3 instructions.
+            //
+            // sxt{h,b} w0, w0
+            // sxt{h,b} w1, w1
+            // sub w0, w0, w1
+            // cmp w0, w0, sxt{h,b}
+            // cset w0, ne
+            // mov w1, #0x80000000
+            // cmp w1, w0
+
+            let extend_op = if ty == I8 {
+                ExtendOp::SXTB
+            } else {
+                ExtendOp::SXTH
+            };
+            let tmp1 = ctx.alloc_tmp(I32).only_reg().unwrap();
+            let tmp2 = ctx.alloc_tmp(I32).only_reg().unwrap();
+            ctx.emit(alu_inst_imm12(ALUOp::Sub32, tmp1, rn, rm));
+            ctx.emit(alu_inst_imm12(
+                ALUOp::SubS32,
+                writable_zero_reg(),
+                tmp1.to_reg(),
+                ResultRSEImm12::RegExtend(tmp1.to_reg(), extend_op),
+            ));
+            ctx.emit(Inst::CSet {
+                rd: tmp2,
+                cond: Cond::Ne,
+            });
+            lower_constant_u64(ctx, tmp1, 0x8000_0000);
+            (tmp1.to_reg(), ResultRSEImm12::Reg(tmp2.to_reg()))
+        } else {
+            (rn, rm)
+        };
+
+        let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
         ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
 
         if let IcmpOutput::Register(rd) = output {
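
For illustration only, not part of the patch: a minimal standalone Rust sketch of the check that the emitted sequence performs for an i8 operand with IntCC::Overflow. The function name i8_sub_overflows and the main driver are hypothetical. Both operands are sign-extended to 32 bits and subtracted; the narrow subtraction overflowed exactly when re-sign-extending the low 8 bits of the 32-bit result changes its value, which is what the cmp w0, w0, sxtb / cset w0, ne pair in the comment above tests.

// Hypothetical sketch mirroring the lowering's overflow test; not code from the patch.
fn i8_sub_overflows(a: i8, b: i8) -> bool {
    // sxtb + sxtb + sub: perform the subtraction in 32-bit arithmetic.
    let wide = (a as i32) - (b as i32);
    // cmp w0, w0, sxtb / cset w0, ne: the result fits in i8 iff
    // re-sign-extending its low byte leaves it unchanged.
    wide != (wide as i8) as i32
}

fn main() {
    assert!(i8_sub_overflows(i8::MIN, 1)); // -128 - 1 = -129 does not fit in i8
    assert!(!i8_sub_overflows(10, 20));    // -10 fits in i8: no overflow
}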