Cranelift: fix #3953: rework single/multiple-use logic in lowering. (#4061)

* Cranelift: fix #3953: rework single/multiple-use logic in lowering.

This PR addresses the longstanding issue that kept loads from merging
into compares on x86-64, and more generally, the lowering framework's
tendency to falsely recognize a "single use" of one op by
another (which would normally allow merging of side-effecting ops like
loads) when there is *indirect* duplication.

To fix this, we replace the direct `value_uses` count with a
transitive notion of uniqueness (not unlike Rust's `&`/`&mut` and how
a `&mut` downgrades to `&` when accessed through another `&`!). A
value is used multiple times transitively if it has multiple direct
uses, or if it is used by an op whose own result is used multiple
times transitively.
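
To make the rule concrete, here is a minimal, self-contained sketch of
the two-pass idea over a toy IR. Everything here (the `UseState` names,
the `defs` map, the worklist) is invented for illustration; it is not
the actual `ValueUseState` implementation in this PR:

```
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UseState {
    Unused,
    Once,     // exactly one use, transitively
    Multiple, // multiple direct uses, or some transitive user is Multiple
}

/// Toy IR: `defs[v]` lists the operands of the op that defines `v`.
fn compute_use_states(defs: &HashMap<&str, Vec<&str>>) -> HashMap<String, UseState> {
    let mut state: HashMap<String, UseState> = HashMap::new();

    // Pass 1: count direct uses, saturating at Multiple.
    for operands in defs.values() {
        for &v in operands {
            let s = state.entry(v.to_string()).or_insert(UseState::Unused);
            *s = match *s {
                UseState::Unused => UseState::Once,
                _ => UseState::Multiple,
            };
        }
    }

    // Pass 2: Multiple is "viral" downward. If a value is Multiple, the
    // operands of its defining op are Multiple too (the `&mut`-through-`&`
    // downgrade), transitively.
    let mut worklist: Vec<String> = state
        .iter()
        .filter(|(_, s)| **s == UseState::Multiple)
        .map(|(v, _)| v.clone())
        .collect();
    while let Some(v) = worklist.pop() {
        for &operand in defs.get(v.as_str()).into_iter().flatten() {
            if state.get(operand) != Some(&UseState::Multiple) {
                state.insert(operand.to_string(), UseState::Multiple);
                worklist.push(operand.to_string());
            }
        }
    }
    state
}
```

Pass 1 is the old direct-use count; pass 2 is the new transitive
"poisoning" step.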

The canonical example of badness is:

```
    v1 := load
    v2 := ifcmp v1, ...
    v3 := selectif v2, ...
    v4 := selectif v2, ...
```

both `v3` and `v4` effectively merge the `ifcmp` (`v2`), so even
though `v2`'s use of `v1` is "unique" (a single direct use), the
`ifcmp` is codegenned twice, and a load merged into it would have its
side effect duplicated. This is why we ~~can't have nice
things~~ can't merge loads into compares (#3953).
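
Feeding the canonical example into the sketch above shows the poisoning
at work (this `main` reuses `UseState` and `compute_use_states` from
that sketch and is likewise illustrative):

```
fn main() {
    let mut defs: HashMap<&str, Vec<&str>> = HashMap::new();
    defs.insert("v2", vec!["v1"]); // v2 := ifcmp v1
    defs.insert("v3", vec!["v2"]); // v3 := selectif v2
    defs.insert("v4", vec!["v2"]); // v4 := selectif v2

    let state = compute_use_states(&defs);
    // v2 has two direct uses, so it is Multiple; that poisons v1
    // transitively, so the load defining v1 must stay un-merged
    // (otherwise its side effect could be codegenned twice).
    assert_eq!(state["v2"], UseState::Multiple);
    assert_eq!(state["v1"], UseState::Multiple);
}
```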

There is quite a subtle and interesting design space around this
problem and how we might solve it. See the long doc-comment on
`ValueUseState` in this PR for more justification for the particular
design here. In particular, this design deliberately simplifies a bit
relative to an "optimal" solution: some uses can *become* unique
depending on merging, but we don't design our data structures for such
updates because that would require significant extra costly
tracking (some sort of transitive refcounting). For example, in the
above, if `selectif` somehow did not merge `ifcmp`, then we would only
codegen the `ifcmp` once into its result register (and use that
register twice); then the load *is* uniquely used, and could be
merged. But that requires transitioning from "multiple use" back to
"unique use" with careful tracking as we do pattern-matching, which
I've chosen to make out-of-scope here for now. In practice, I don't
think it will matter too much (and we can always improve later).

With this PR, we can now re-enable load-op merging for compares. A
subsequent commit does this.

* Update x64 backend to allow load-op merging for `cmp`.

* Update filetests.

* Add test for cmp-mem merging on x64.

* Comment fixes.

* Rework ValueUseState analysis for better performance.

* Update s390x filetest: iadd_ifcout cannot merge loads anymore because it has multiple outputs (a ValueUseState limitation).

* Address review comments.

Commit: e4b7c8a737 (parent: 6a36a1d15d)
Author: Chris Fallin
Date: 2022-04-22 18:00:48 -07:00
Committed by: GitHub
13 changed files with 547 additions and 298 deletions


@@ -306,7 +306,8 @@ fn put_input_in_rs<C: LowerCtx<I = Inst>>(
     narrow_mode: NarrowValueMode,
 ) -> ResultRS {
     let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-    if let Some((insn, 0)) = inputs.inst {
+    // Unique or non-unique use is fine for merging here.
+    if let Some((insn, 0)) = inputs.inst.as_inst() {
         let op = ctx.data(insn).opcode();
         if op == Opcode::Ishl {
@@ -353,7 +354,7 @@ fn get_as_extended_value<C: LowerCtx<I = Inst>>(
     narrow_mode: NarrowValueMode,
 ) -> Option<(Value, ExtendOp)> {
     let inputs = ctx.get_value_as_source_or_const(val);
-    let (insn, n) = inputs.inst?;
+    let (insn, n) = inputs.inst.as_inst()?;
     if n != 0 {
         return None;
     }
@@ -1125,7 +1126,7 @@ pub(crate) fn maybe_input_insn<C: LowerCtx<I = Inst>>(
         inputs,
         op
     );
-    if let Some((src_inst, _)) = inputs.inst {
+    if let Some((src_inst, _)) = inputs.inst.as_inst() {
         let data = c.data(src_inst);
         log::trace!(" -> input inst {:?}", data);
         if data.opcode() == op {
@@ -1161,14 +1162,14 @@ pub(crate) fn maybe_input_insn_via_conv<C: LowerCtx<I = Inst>>(
     conv: Opcode,
 ) -> Option<IRInst> {
     let inputs = c.get_input_as_source_or_const(input.insn, input.input);
-    if let Some((src_inst, _)) = inputs.inst {
+    if let Some((src_inst, _)) = inputs.inst.as_inst() {
         let data = c.data(src_inst);
         if data.opcode() == op {
             return Some(src_inst);
         }
         if data.opcode() == conv {
             let inputs = c.get_input_as_source_or_const(src_inst, 0);
-            if let Some((src_inst, _)) = inputs.inst {
+            if let Some((src_inst, _)) = inputs.inst.as_inst() {
                 let data = c.data(src_inst);
                 if data.opcode() == op {
                     return Some(src_inst);


@@ -12,7 +12,7 @@ use super::{
     NZCV,
 };
 use crate::isa::aarch64::settings::Flags as IsaFlags;
-use crate::machinst::isle::*;
+use crate::machinst::{isle::*, InputSourceInst};
 use crate::settings::Flags;
 use crate::{
     binemit::CodeOffset,
@@ -245,7 +245,7 @@
     fn sinkable_atomic_load(&mut self, val: Value) -> Option<SinkableAtomicLoad> {
         let input = self.lower_ctx.get_value_as_source_or_const(val);
-        if let Some((atomic_load, 0)) = input.inst {
+        if let InputSourceInst::UniqueUse(atomic_load, 0) = input.inst {
             if self.lower_ctx.data(atomic_load).opcode() == Opcode::AtomicLoad {
                 let atomic_addr = self.lower_ctx.input_as_value(atomic_load, 0);
                 return Some(SinkableAtomicLoad {
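
A note for readers of these hunks: `inputs.inst` / `input.inst` is no
longer a plain `Option<(Inst, usize)>` but an enum that carries the
analysis verdict. Only the `InputSourceInst::UniqueUse` variant and the
`as_inst()` accessor appear verbatim in the diffs on this page; the
remaining variant names and shapes below are assumptions about the full
definition, sketched for orientation (`Inst` is Cranelift's IR
instruction handle):

```
// Sketch only: variant names other than `UniqueUse`, and the exact
// field shapes, are assumptions about the full definition.
use cranelift_codegen::ir::Inst; // in-tree this would be `crate::ir::Inst`

pub enum InputSourceInst {
    /// Producing instruction whose output is used exactly once,
    /// *transitively*: sinking a side-effecting op (a load) into the
    /// consumer is safe.
    UniqueUse(Inst, usize),
    /// Producing instruction with (potentially) multiple transitive
    /// uses: matching on pure ops is still fine, but sinking a load
    /// here could duplicate its side effect.
    Use(Inst, usize),
    /// No producing instruction (e.g., a block parameter).
    None,
}

impl InputSourceInst {
    /// The producing instruction regardless of uniqueness; for callers
    /// that only merge pure (side-effect-free) patterns.
    pub fn as_inst(&self) -> Option<(Inst, usize)> {
        match *self {
            InputSourceInst::UniqueUse(inst, n) | InputSourceInst::Use(inst, n) => {
                Some((inst, n))
            }
            InputSourceInst::None => None,
        }
    }
}
```

Accordingly, call sites that merge only pure ops (e.g. `matches_input`)
switch to `as_inst()`, while call sites that sink side-effecting loads
(e.g. `sinkable_load`, `sinkable_atomic_load`) now demand `UniqueUse`.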


@@ -497,7 +497,7 @@
     #[inline]
     fn sinkable_inst(&mut self, val: Value) -> Option<Inst> {
         let input = self.lower_ctx.get_value_as_source_or_const(val);
-        if let Some((inst, 0)) = input.inst {
+        if let Some((inst, 0)) = input.inst.as_inst() {
             return Some(inst);
         }
         None


@@ -1367,11 +1367,9 @@
 (decl cmp_and_choose (Type CC Value Value) ValueRegs)
 (rule (cmp_and_choose (fits_in_64 ty) cc x y)
-      (let ((x_reg Gpr x)
-            (y_reg Gpr y)
-            (size OperandSize (raw_operand_size_of_type ty)))
-        (with_flags_reg (x64_cmp size x_reg y_reg)
-          (cmove ty cc y_reg x_reg))))
+      (let ((size OperandSize (raw_operand_size_of_type ty)))
+        (with_flags_reg (x64_cmp size x y)
+          (cmove ty cc y x))))
 (rule (lower (has_type (fits_in_64 ty) (umin x y)))
       (cmp_and_choose ty (CC.B) x y))
@@ -1751,19 +1749,8 @@
 ;; than one instruction for certain types (e.g., XMM-held, I128).
 (rule (lower (has_type ty (select (icmp cc a @ (value_type (fits_in_64 a_ty)) b) x y)))
-      ;; N.B.: we force the comparison operators into registers, and disallow
-      ;; load-op fusion, because we do not have a transitive guarantee that this
-      ;; cmp-site will be the sole user of the value. Consider: the `icmp` might
-      ;; be the only user of a load, but there may be multiple users of the
-      ;; `icmp` (e.g., `select` or `bint` instructions) that each invoke emit a
-      ;; comparison. If we were to allow a load to sink to the *latest* one, but
-      ;; other sites did not permit sinking, then we would be missing the load
-      ;; for other cmp-sites. TODO:
-      ;; https://github.com/bytecodealliance/wasmtime/issues/3953.
-      (let ((gpr_a Gpr (put_in_gpr a))
-            (gpr_b Gpr (put_in_gpr b))
-            (size OperandSize (raw_operand_size_of_type a_ty)))
-        (with_flags (x64_cmp size gpr_b gpr_a) (cmove_from_values ty cc x y))))
+      (let ((size OperandSize (raw_operand_size_of_type a_ty)))
+        (with_flags (x64_cmp size b a) (cmove_from_values ty cc x y))))
 ;; Finally, we lower `select` from a condition value `c`. These rules are meant
 ;; to be the final, default lowerings if no other patterns matched above.


@@ -61,7 +61,7 @@ fn matches_input<C: LowerCtx<I = Inst>>(
     op: Opcode,
 ) -> Option<IRInst> {
     let inputs = ctx.get_input_as_source_or_const(input.insn, input.input);
-    inputs.inst.and_then(|(src_inst, _)| {
+    inputs.inst.as_inst().and_then(|(src_inst, _)| {
         let data = ctx.data(src_inst);
         if data.opcode() == op {
             return Some(src_inst);
@@ -172,7 +172,7 @@ fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegM
         return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
     }
-    if let Some((src_insn, 0)) = inputs.inst {
+    if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
         if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
             ctx.sink_inst(src_insn);
             let amode = lower_to_amode(ctx, addr_input, offset);
@@ -479,22 +479,11 @@ fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntC
     } else {
         // TODO Try to commute the operands (and invert the condition) if one is an immediate.
         let lhs = put_input_in_reg(ctx, inputs[0]);
-        // We force the RHS into a register, and disallow load-op fusion, because we
-        // do not have a transitive guarantee that this cmp-site will be the sole
-        // user of the value. Consider: the icmp might be the only user of a load,
-        // but there may be multiple users of the icmp (e.g. select or bint
-        // instructions) that each invoke `emit_cmp()`. If we were to allow a load
-        // to sink to the *latest* one, but other sites did not permit sinking, then
-        // we would be missing the load for other cmp-sites.
-        let rhs = put_input_in_reg(ctx, inputs[1]);
+        let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
         // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives
         // us dst - src at the machine instruction level, so invert operands.
-        ctx.emit(Inst::cmp_rmi_r(
-            OperandSize::from_ty(ty),
-            RegMemImm::reg(rhs),
-            lhs,
-        ));
+        ctx.emit(Inst::cmp_rmi_r(OperandSize::from_ty(ty), rhs, lhs));
         cc
     }
 }
@@ -578,10 +567,8 @@ fn emit_fcmp<C: LowerCtx<I = Inst>>(
         (inputs[0], inputs[1])
     };
     let lhs = put_input_in_reg(ctx, lhs_input);
-    // See above in `emit_cmp()`. We must only use the reg/reg form of the
-    // comparison in order to avoid issues with merged loads.
-    let rhs = put_input_in_reg(ctx, rhs_input);
-    ctx.emit(Inst::xmm_cmp_rm_r(op, RegMem::reg(rhs), lhs));
+    let rhs = input_to_reg_mem(ctx, rhs_input);
+    ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
     let cond_result = match cond_code {
         FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
@@ -2406,6 +2393,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let cmp_insn = ctx
                 .get_input_as_source_or_const(inputs[0].insn, inputs[0].input)
                 .inst
+                .as_inst()
                 .unwrap()
                 .0;
             debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp);
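
The `emit_cmp`/`emit_fcmp` hunks above lean on that same gate. Roughly,
the helper they now call looks like this (a sketch with assumed
signatures, mirroring the `input_to_reg_mem` hunk above; the real
helper also handles the constant-to-immediate case, omitted here):

```
// Sketch only: assumes the surrounding x64 lowering module's imports
// (LowerCtx, InsnInput, RegMemImm, is_mergeable_load, lower_to_amode,
// put_input_in_reg), all of which appear in the hunks above.
fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
    // `UniqueUse` means the ValueUseState analysis proved this value is
    // used exactly once *transitively*, so sinking the load here cannot
    // duplicate its side effect, even if several ops merge the same icmp.
    if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
            ctx.sink_inst(src_insn); // the load is emitted only at this use
            let amode = lower_to_amode(ctx, addr_input, offset);
            return RegMemImm::mem(amode);
        }
    }
    // Otherwise fall back to a register operand.
    RegMemImm::reg(put_input_in_reg(ctx, spec))
}
```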


@@ -2,7 +2,7 @@
 // Pull in the ISLE generated code.
 pub(crate) mod generated_code;
-use crate::machinst::{Reg, Writable};
+use crate::machinst::{InputSourceInst, Reg, Writable};
 use generated_code::MInst;
 // Types that the generated ISLE code uses via `use super::*`.
@@ -84,7 +84,7 @@
             return RegMemImm::reg(generated_code::constructor_imm(self, ty, c).unwrap());
         }
-        if let Some((src_insn, 0)) = inputs.inst {
+        if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
             if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
                 self.lower_ctx.sink_inst(src_insn);
                 let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
@@ -105,7 +105,7 @@
             return RegMem::reg(generated_code::constructor_imm(self, ty, c).unwrap());
         }
-        if let Some((src_insn, 0)) = inputs.inst {
+        if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
             if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
                 self.lower_ctx.sink_inst(src_insn);
                 let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
@@ -237,7 +237,7 @@
     fn sinkable_load(&mut self, val: Value) -> Option<SinkableLoad> {
         let input = self.lower_ctx.get_value_as_source_or_const(val);
-        if let Some((inst, 0)) = input.inst {
+        if let InputSourceInst::UniqueUse(inst, 0) = input.inst {
             if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, inst) {
                 return Some(SinkableLoad {
                     inst,


@@ -1,4 +1,4 @@
src/clif.isle 443b34b797fc8ace
src/prelude.isle afd037c4d91c875c
src/isa/x64/inst.isle cad03431447aca1b
src/isa/x64/lower.isle 42bd3982a6132a2f
src/isa/x64/lower.isle 803aac716f6f4c39

File diff suppressed because it is too large.