AArch64 codegen: represent bool true as -1, not 1.

It seems that this is actually the correct behavior for bool types wider
than `b1`; some of the vector instruction optimizations depend on bool
lanes representing false and true as all-zeroes and all-ones
respectively. For `b8`..`b64`, this results in an extra negation after a
`cset` when a bool is produced by an `icmp`/`fcmp`, but the most common
case (`b1`) is unaffected, because an all-ones one-bit value is just
`1`.

An example of this assumption can be seen here:

399ee0a54c/cranelift/codegen/src/simple_preopt.rs (L956)

Thanks to Joey Gouly of ARM for noting this issue while implementing
SIMD support, and digging into the source (finding the above example) to
determine the correct behavior.
This commit is contained in:
Chris Fallin
2020-07-21 14:04:23 -07:00
parent c420f65214
commit b8f6d53a6b
5 changed files with 108 additions and 70 deletions

View File

@@ -1380,13 +1380,6 @@ impl MachInstEmit for Inst {
&Inst::MovFromNZCV { rd } => { &Inst::MovFromNZCV { rd } => {
sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg())); sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg()));
} }
&Inst::CondSet { rd, cond } => {
sink.put4(
0b100_11010100_11111_0000_01_11111_00000
| (cond.invert().bits() << 12)
| machreg_to_gpr(rd.to_reg()),
);
}
&Inst::Extend { &Inst::Extend {
rd, rd,
rn, rn,

View File

@@ -1888,14 +1888,6 @@ fn test_aarch64_binemit() {
"1B423BD5", "1B423BD5",
"mrs x27, nzcv", "mrs x27, nzcv",
)); ));
insns.push((
Inst::CondSet {
rd: writable_xreg(5),
cond: Cond::Hi,
},
"E5979F9A",
"cset x5, hi",
));
insns.push(( insns.push((
Inst::VecDup { Inst::VecDup {
rd: writable_vreg(25), rd: writable_vreg(25),

View File

@@ -809,12 +809,6 @@ pub enum Inst {
rd: Writable<Reg>, rd: Writable<Reg>,
}, },
/// Set a register to 1 if condition, else 0.
CondSet {
rd: Writable<Reg>,
cond: Cond,
},
/// A machine call instruction. N.B.: this allows only a +/- 128MB offset (it uses a relocation /// A machine call instruction. N.B.: this allows only a +/- 128MB offset (it uses a relocation
/// of type `Reloc::Arm64Call`); if the destination distance is not `RelocDistance::Near`, the /// of type `Reloc::Arm64Call`); if the destination distance is not `RelocDistance::Near`, the
/// code should use a `LoadExtName` / `CallInd` sequence instead, allowing an arbitrary 64-bit /// code should use a `LoadExtName` / `CallInd` sequence instead, allowing an arbitrary 64-bit
@@ -1358,9 +1352,6 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::MovFromNZCV { rd } => { &Inst::MovFromNZCV { rd } => {
collector.add_def(rd); collector.add_def(rd);
} }
&Inst::CondSet { rd, .. } => {
collector.add_def(rd);
}
&Inst::Extend { rd, rn, .. } => { &Inst::Extend { rd, rn, .. } => {
collector.add_def(rd); collector.add_def(rd);
collector.add_use(rn); collector.add_use(rn);
@@ -1954,9 +1945,6 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
&mut Inst::MovFromNZCV { ref mut rd } => { &mut Inst::MovFromNZCV { ref mut rd } => {
map_def(mapper, rd); map_def(mapper, rd);
} }
&mut Inst::CondSet { ref mut rd, .. } => {
map_def(mapper, rd);
}
&mut Inst::Extend { &mut Inst::Extend {
ref mut rd, ref mut rd,
ref mut rn, ref mut rn,
@@ -2830,11 +2818,6 @@ impl Inst {
let rd = rd.to_reg().show_rru(mb_rru); let rd = rd.to_reg().show_rru(mb_rru);
format!("mrs {}, nzcv", rd) format!("mrs {}, nzcv", rd)
} }
&Inst::CondSet { rd, cond } => {
let rd = rd.to_reg().show_rru(mb_rru);
let cond = cond.show_rru(mb_rru);
format!("cset {}, {}", rd, cond)
}
&Inst::Extend { &Inst::Extend {
rd, rd,
rn, rn,

View File

@@ -1014,6 +1014,24 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
} }
} }
/// Convert a 0 / 1 result, such as from a conditional-set instruction, into a 0
/// / -1 (all-ones) result as expected for bool operations.
pub(crate) fn normalize_bool_result<C: LowerCtx<I = Inst>>(
ctx: &mut C,
insn: IRInst,
rd: Writable<Reg>,
) {
// A boolean is 0 / -1; if output width is > 1, negate.
if ty_bits(ctx.output_ty(insn, 0)) > 1 {
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub64,
rd,
rn: zero_reg(),
rm: rd.to_reg(),
});
}
}
//============================================================================= //=============================================================================
// Lowering-backend trait implementation. // Lowering-backend trait implementation.

View File

@@ -1208,6 +1208,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
let rd = get_output_reg(ctx, outputs[0]); let rd = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::CSet { rd, cond }); ctx.emit(Inst::CSet { rd, cond });
normalize_bool_result(ctx, insn, rd);
} }
Opcode::Trueff => { Opcode::Trueff => {
@@ -1217,6 +1218,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
let rd = get_output_reg(ctx, outputs[0]); let rd = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::CSet { rd, cond }); ctx.emit(Inst::CSet { rd, cond });
normalize_bool_result(ctx, insn, rd);
} }
Opcode::IsNull | Opcode::IsInvalid => { Opcode::IsNull | Opcode::IsInvalid => {
@@ -1240,6 +1242,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap()); let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value)); ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
ctx.emit(Inst::CSet { rd, cond: Cond::Eq }); ctx.emit(Inst::CSet { rd, cond: Cond::Eq });
normalize_bool_result(ctx, insn, rd);
} }
Opcode::Copy => { Opcode::Copy => {
@@ -1249,49 +1252,95 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::gen_move(rd, rn, ty)); ctx.emit(Inst::gen_move(rd, rn, ty));
} }
Opcode::Bint | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => { Opcode::Breduce | Opcode::Ireduce => {
// If this is a Bint from a Trueif/Trueff/IsNull/IsInvalid, then the result is already // Smaller integers/booleans are stored with high-order bits
// 64-bit-zero-extended, even if the CLIF type doesn't say so, because it was produced // undefined, so we can simply do a copy.
// by a CSet. In this case, we do not need to do any zero-extension. let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let input_info = ctx.get_input(insn, 0);
let src_op = input_info
.inst
.map(|(src_inst, _)| ctx.data(src_inst).opcode());
let narrow_mode = match (src_op, op) {
(Some(Opcode::Trueif), Opcode::Bint)
| (Some(Opcode::Trueff), Opcode::Bint)
| (Some(Opcode::IsNull), Opcode::Bint)
| (Some(Opcode::IsInvalid), Opcode::Bint) => NarrowValueMode::None,
_ => NarrowValueMode::ZeroExtend64,
};
// All of these ops are simply a move from a zero-extended source.
// Here is why this works, in each case:
//
// - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we
// merely need to zero-extend here.
//
// - Breduce, Bextend: changing width of a boolean. We represent a
// bool as a 0 or 1, so again, this is a zero-extend / no-op.
//
// - Ireduce: changing width of an integer. Smaller ints are stored
// with undefined high-order bits, so we can simply do a copy.
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
let rd = get_output_reg(ctx, outputs[0]); let rd = get_output_reg(ctx, outputs[0]);
let ty = ctx.input_ty(insn, 0); let ty = ctx.input_ty(insn, 0);
ctx.emit(Inst::gen_move(rd, rn, ty)); ctx.emit(Inst::gen_move(rd, rn, ty));
} }
Opcode::Bmask => { Opcode::Bextend | Opcode::Bmask => {
// Bool is {0, 1}, so we can subtract from 0 to get all-1s. // Bextend and Bmask both simply sign-extend. This works for:
// - Bextend, because booleans are stored as 0 / -1, so we
// sign-extend the -1 to a -1 in the wider width.
// - Bmask, because the resulting integer mask value must be
// all-ones (-1) if the argument is true.
//
// For a sign-extension from a 1-bit value (Case 1 below), we need
// to do things a bit specially, because the ISA does not have a
// 1-to-N-bit sign extension instruction. For 8-bit or wider
// sources (Case 2 below), we do a sign extension normally.
let from_ty = ctx.input_ty(insn, 0);
let to_ty = ctx.output_ty(insn, 0);
let from_bits = ty_bits(from_ty);
let to_bits = ty_bits(to_ty);
assert!(
from_bits <= 64 && to_bits <= 64,
"Vector Bextend not supported yet"
);
assert!(from_bits <= to_bits);
if from_bits == to_bits {
// Nothing.
} else if from_bits == 1 {
assert!(to_bits >= 8);
// Case 1: 1-bit to N-bit extension: AND the LSB of source into
// dest, generating a value of 0 or 1, then negate to get
// 0x000... or 0xfff...
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]); let rd = get_output_reg(ctx, outputs[0]);
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); // AND Rdest, Rsource, #1
ctx.emit(Inst::AluRRImmLogic {
alu_op: ALUOp::And64,
rd,
rn,
imml: ImmLogic::maybe_from_u64(1, I64).unwrap(),
});
// SUB Rdest, XZR, Rdest (i.e., NEG Rdest)
ctx.emit(Inst::AluRRR { ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Sub64, alu_op: ALUOp::Sub64,
rd, rd,
rn: zero_reg(), rn: zero_reg(),
rm, rm: rd.to_reg(),
});
} else {
// Case 2: 8-or-more-bit to N-bit extension: just sign-extend. A
// `true` (all ones, or `-1`) will be extended to -1 with the
// larger width.
assert!(from_bits >= 8);
let narrow_mode = if to_bits == 64 {
NarrowValueMode::SignExtend64
} else {
assert!(to_bits <= 32);
NarrowValueMode::SignExtend32
};
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
let rd = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::gen_move(rd, rn, to_ty));
}
}
Opcode::Bint => {
// Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
// out the LSB to give a 0 / 1-valued integer result.
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]);
let output_bits = ty_bits(ctx.output_ty(insn, 0));
let (imm_ty, alu_op) = if output_bits > 32 {
(I64, ALUOp::And64)
} else {
(I32, ALUOp::And32)
};
ctx.emit(Inst::AluRRImmLogic {
alu_op,
rd,
rn,
imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
}); });
} }
@@ -1369,7 +1418,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode); let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
ctx.emit(Inst::CondSet { cond, rd }); ctx.emit(Inst::CSet { cond, rd });
normalize_bool_result(ctx, insn, rd);
} else { } else {
let rm = put_input_in_reg(ctx, inputs[1], narrow_mode); let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
lower_vector_compare(ctx, rd, rn, rm, ty, cond)?; lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
@@ -1394,7 +1444,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} }
_ => panic!("Bad float size"), _ => panic!("Bad float size"),
} }
ctx.emit(Inst::CondSet { cond, rd }); ctx.emit(Inst::CSet { cond, rd });
normalize_bool_result(ctx, insn, rd);
} else { } else {
lower_vector_compare(ctx, rd, rn, rm, ty, cond)?; lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
} }
@@ -1659,6 +1710,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}); });
ctx.emit(Inst::CSet { rd, cond: Cond::Ne }); ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
normalize_bool_result(ctx, insn, rd);
} }
Opcode::Shuffle Opcode::Shuffle