x64 backend: add lowerings with load-op-store fusion. (#4071)

x64 backend: add lowerings with load-op-store fusion.

These lowerings use the `OP [mem], reg` forms (or in AT&T syntax, `OP
%reg, (mem)`) -- i.e., x86 instructions that load from memory, perform
an ALU operation, and store the result, all in one instruction. Using
these instruction forms, we can merge three CLIF ops together: a load,
an arithmetic operation, and a store.
This commit is contained in:
Chris Fallin
2022-04-26 18:58:26 -07:00
committed by GitHub
parent 164bfeaf7e
commit dd45f44511
9 changed files with 1442 additions and 298 deletions

View File

@@ -20,6 +20,12 @@
(src2 GprMemImm)
(dst WritableGpr))
;; Integer arithmetic read-modify-write on memory: `src1_dst` is the
;; memory operand (both an input and the destination, per its name) and
;; `src2` is the register operand.
(AluRM (size OperandSize) ;; 4 or 8
(op AluRmiROpcode)
(src1_dst SyntheticAmode)
(src2 Gpr))
;; Instructions on general-purpose registers that only read src and
;; define dst (dst is not modified). `bsr`, etc.
(UnaryRmR (size OperandSize) ;; 2, 4, or 8
@@ -2695,6 +2701,32 @@
;; `x64_pcmpgtq`: builds the instruction through `xmm_rm_r` with type
;; `$I64X2` and opcode `SseOpcode.Pcmpgtq`, taking an `Xmm` and an `XmmMem`
;; operand and producing an `Xmm`.
(decl x64_pcmpgtq (Xmm XmmMem) Xmm)
(rule (x64_pcmpgtq x y) (xmm_rm_r $I64X2 (SseOpcode.Pcmpgtq) x y))
;; Helpers for read-modify-write ALU form (AluRM).
;;
;; `alu_rm` builds an `MInst.AluRM` for `opcode` with memory operand
;; `src1_dst` and register operand `src2`, and wraps it as a
;; `SideEffectNoResult.Inst`. The operand size is derived from `ty` by
;; `operand_size_of_type_32_64`.
(decl alu_rm (Type AluRmiROpcode Amode Gpr) SideEffectNoResult)
(rule (alu_rm ty opcode src1_dst src2)
(let ((size OperandSize (operand_size_of_type_32_64 ty)))
(SideEffectNoResult.Inst (MInst.AluRM size opcode src1_dst src2))))
;; `x64_add_mem`: `alu_rm` with the opcode fixed to `AluRmiROpcode.Add`;
;; `addr` is the memory operand and `val` the register operand.
(decl x64_add_mem (Type Amode Gpr) SideEffectNoResult)
(rule (x64_add_mem ty addr val)
(alu_rm ty (AluRmiROpcode.Add) addr val))
;; `x64_sub_mem`: `alu_rm` with the opcode fixed to `AluRmiROpcode.Sub`;
;; `addr` is the memory operand and `val` the register operand.
(decl x64_sub_mem (Type Amode Gpr) SideEffectNoResult)
(rule (x64_sub_mem ty addr val)
(alu_rm ty (AluRmiROpcode.Sub) addr val))
;; `x64_and_mem`: `alu_rm` with the opcode fixed to `AluRmiROpcode.And`;
;; `addr` is the memory operand and `val` the register operand.
(decl x64_and_mem (Type Amode Gpr) SideEffectNoResult)
(rule (x64_and_mem ty addr val)
(alu_rm ty (AluRmiROpcode.And) addr val))
;; `x64_or_mem`: `alu_rm` with the opcode fixed to `AluRmiROpcode.Or`;
;; `addr` is the memory operand and `val` the register operand.
(decl x64_or_mem (Type Amode Gpr) SideEffectNoResult)
(rule (x64_or_mem ty addr val)
(alu_rm ty (AluRmiROpcode.Or) addr val))
;; `x64_xor_mem`: `alu_rm` with the opcode fixed to `AluRmiROpcode.Xor`;
;; `addr` is the memory operand and `val` the register operand.
(decl x64_xor_mem (Type Amode Gpr) SideEffectNoResult)
(rule (x64_xor_mem ty addr val)
(alu_rm ty (AluRmiROpcode.Xor) addr val))
;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(convert Gpr InstOutput output_gpr)

View File

@@ -270,6 +270,38 @@ pub(crate) fn emit(
}
}
// Emit the read-modify-write ALU form: `op` applied to the memory operand
// `src1_dst` with the register operand `src2`.
Inst::AluRM {
size,
src1_dst,
src2,
op,
} => {
// Allocations are consumed in this order: `src2` first, then whatever
// the addressing mode takes via `with_allocs`.
let src2 = allocs.next(src2.to_reg());
let src1_dst = src1_dst.finalize(state, sink).with_allocs(allocs);
// Only 32- and 64-bit operand sizes are accepted by this arm.
assert!(*size == OperandSize::Size32 || *size == OperandSize::Size64);
// One opcode byte per supported operation; any other `AluRmiROpcode`
// variant panics.
let opcode = match op {
AluRmiROpcode::Add => 0x01,
AluRmiROpcode::Sub => 0x29,
AluRmiROpcode::And => 0x21,
AluRmiROpcode::Or => 0x09,
AluRmiROpcode::Xor => 0x31,
_ => panic!("Unsupported read-modify-write ALU opcode"),
};
// The register operand supplies `enc_g`; the memory operand is passed
// by reference as the addressing mode.
let enc_g = int_reg_enc(src2);
// NOTE(review): the literal `1` below is presumably the number of
// opcode bytes -- confirm against `emit_std_enc_mem`'s signature.
emit_std_enc_mem(
sink,
state,
info,
LegacyPrefixes::None,
opcode,
1,
enc_g,
&src1_dst,
RexFlags::from(*size),
);
}
Inst::UnaryRmR { size, op, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let rex_flags = RexFlags::from(*size);

View File

@@ -1500,6 +1500,125 @@ fn test_x64_emit() {
"imull %esi, $76543210, %esi",
));
// ========================================================
// AluRM
// NOTE(review): in each tuple the two strings appear to be the expected
// encoding in hex and the expected pretty-printed text -- confirm against
// the code that consumes `insns`.
//
// 32-bit add of r12 into memory at 99(%rdi).
insns.push((
Inst::AluRM {
size: OperandSize::Size32,
op: AluRmiROpcode::Add,
src1_dst: Amode::imm_reg(99, rdi).into(),
src2: Gpr::new(r12).unwrap(),
},
"44016763",
"addl %r12d, 99(%rdi)",
));
// 64-bit add of rax into memory addressed by base rbp plus rax scaled by 8
// (shift of 3), with a zero immediate offset.
insns.push((
Inst::AluRM {
size: OperandSize::Size64,
op: AluRmiROpcode::Add,
src1_dst: Amode::imm_reg_reg_shift(
0,
Gpr::new(rbp).unwrap(),
Gpr::new(rax).unwrap(),
3,
)
.into(),
src2: Gpr::new(rax).unwrap(),
},
"480144C500",
"addq %rax, 0(%rbp,%rax,8)",
));
// AluRM `Sub`, 32-bit: register rcx, memory operand 0(%rsp).
insns.push((
Inst::AluRM {
size: OperandSize::Size32,
op: AluRmiROpcode::Sub,
src1_dst: Amode::imm_reg(0, rsp).into(),
src2: Gpr::new(rcx).unwrap(),
},
"290C24",
"subl %ecx, 0(%rsp)",
));
// AluRM `Sub`, 64-bit: register rax, memory operand 0(%rbp).
insns.push((
Inst::AluRM {
size: OperandSize::Size64,
op: AluRmiROpcode::Sub,
src1_dst: Amode::imm_reg(0, rbp).into(),
src2: Gpr::new(rax).unwrap(),
},
"48294500",
"subq %rax, 0(%rbp)",
));
// AluRM `And`, 32-bit: register rcx, memory operand 0(%rsp).
insns.push((
Inst::AluRM {
size: OperandSize::Size32,
op: AluRmiROpcode::And,
src1_dst: Amode::imm_reg(0, rsp).into(),
src2: Gpr::new(rcx).unwrap(),
},
"210C24",
"andl %ecx, 0(%rsp)",
));
// AluRM `And`, 64-bit: register rax, memory operand 0(%rbp).
insns.push((
Inst::AluRM {
size: OperandSize::Size64,
op: AluRmiROpcode::And,
src1_dst: Amode::imm_reg(0, rbp).into(),
src2: Gpr::new(rax).unwrap(),
},
"48214500",
"andq %rax, 0(%rbp)",
));
// AluRM `Or`, 32-bit: register rcx, memory operand 0(%rsp).
insns.push((
Inst::AluRM {
size: OperandSize::Size32,
op: AluRmiROpcode::Or,
src1_dst: Amode::imm_reg(0, rsp).into(),
src2: Gpr::new(rcx).unwrap(),
},
"090C24",
"orl %ecx, 0(%rsp)",
));
// AluRM `Or`, 64-bit: register rax, memory operand 0(%rbp).
insns.push((
Inst::AluRM {
size: OperandSize::Size64,
op: AluRmiROpcode::Or,
src1_dst: Amode::imm_reg(0, rbp).into(),
src2: Gpr::new(rax).unwrap(),
},
"48094500",
"orq %rax, 0(%rbp)",
));
// AluRM `Xor`, 32-bit: register rcx, memory operand 0(%rsp).
insns.push((
Inst::AluRM {
size: OperandSize::Size32,
op: AluRmiROpcode::Xor,
src1_dst: Amode::imm_reg(0, rsp).into(),
src2: Gpr::new(rcx).unwrap(),
},
"310C24",
"xorl %ecx, 0(%rsp)",
));
// AluRM `Xor`, 64-bit: register rax, memory operand 0(%rbp).
insns.push((
Inst::AluRM {
size: OperandSize::Size64,
op: AluRmiROpcode::Xor,
src1_dst: Amode::imm_reg(0, rbp).into(),
src2: Gpr::new(rax).unwrap(),
},
"48314500",
"xorq %rax, 0(%rbp)",
));
// ========================================================
// UnaryRmR

View File

@@ -44,6 +44,7 @@ impl Inst {
// These instructions are part of SSE2, which is a basic requirement in Cranelift, and
// don't have to be checked.
Inst::AluRmiR { .. }
| Inst::AluRM { .. }
| Inst::AtomicRmwSeq { .. }
| Inst::CallKnown { .. }
| Inst::CallUnknown { .. }
@@ -917,6 +918,22 @@ impl PrettyPrint for Inst {
dst
)
}
// Pretty-print the read-modify-write ALU form as
// `<mnemonic><suffix> <src2>, <src1_dst>`.
Inst::AluRM {
    size,
    op,
    src1_dst,
    src2,
} => {
    // Both operands are printed at the width chosen by `size_lqb`.
    let width = size_lqb(*size, op.is_8bit());
    // Keep this order: the register operand takes its allocation before
    // the memory operand does.
    let reg_text = pretty_print_reg(src2.to_reg(), width, allocs);
    let mem_text = src1_dst.pretty_print(width, allocs);
    let mnemonic = ljustify2(op.to_string(), suffix_lqb(*size, op.is_8bit()));
    format!("{} {}, {}", mnemonic, reg_text, mem_text)
}
Inst::UnaryRmR { src, dst, op, size } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
let src = src.pretty_print(size.to_bytes(), allocs);
@@ -1691,6 +1708,10 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
src2.get_operands(collector);
}
}
// Register `src2` is collected as a use before the operands of the memory
// operand `src1_dst`; the emission and pretty-printing arms for this
// instruction consume allocations in the same order.
Inst::AluRM { src1_dst, src2, .. } => {
collector.reg_use(src2.to_reg());
src1_dst.get_operands(collector);
}
Inst::Not { src, dst, .. } => {
collector.reg_use(src.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);

View File

@@ -2655,3 +2655,131 @@
(side_effect_concat
(x64_movrm $I64 addr_lo value_lo)
(x64_movrm $I64 addr_hi value_hi)))))
;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Add mem, reg
;;
;; Matches a `store` whose stored value has a type accepted by
;; `ty_32_or_64` and is an `iadd` with a sinkable `load` as its first
;; operand, where the store's flags, address, and offset must equal the
;; load's (`=flags`, `=addr`, `=offset`). The three ops are lowered to a
;; single `x64_add_mem` on that address with `src2` as the register operand.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(iadd (and
(sinkable_load sink)
(load flags addr offset))
src2))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_add_mem ty (to_amode flags addr offset) src2))))
;; Add mem, reg with args swapped
;;
;; Same shape as the non-swapped add rule, but the sinkable `load` is the
;; second operand of the `iadd` and `src2` is the first. The store's flags,
;; address, and offset must equal the load's; the result is a single
;; `x64_add_mem`.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(iadd src2
(and
(sinkable_load sink)
(load flags addr offset))))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_add_mem ty (to_amode flags addr offset) src2))))
;; Sub mem, reg
;;
;; Matches a `store` whose stored value has a type accepted by
;; `ty_32_or_64` and is an `isub` with a sinkable `load` as its first
;; operand and `src2` as its second; the store's flags, address, and offset
;; must equal the load's. Only this operand order is matched by this rule.
;; The three ops are lowered to a single `x64_sub_mem`.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(isub (and
(sinkable_load sink)
(load flags addr offset))
src2))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_sub_mem ty (to_amode flags addr offset) src2))))
;; And mem, reg
;;
;; Matches a `store` whose stored value has a type accepted by
;; `ty_32_or_64` and is a `band` with a sinkable `load` as its first
;; operand; the store's flags, address, and offset must equal the load's.
;; The three ops are lowered to a single `x64_and_mem` with `src2` as the
;; register operand.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(band (and
(sinkable_load sink)
(load flags addr offset))
src2))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_and_mem ty (to_amode flags addr offset) src2))))
;; And mem, reg with args swapped
;;
;; Same shape as the non-swapped and rule, but the sinkable `load` is the
;; second operand of the `band` and `src2` is the first. The store's flags,
;; address, and offset must equal the load's; the result is a single
;; `x64_and_mem`.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(band src2
(and
(sinkable_load sink)
(load flags addr offset))))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_and_mem ty (to_amode flags addr offset) src2))))
;; Or mem, reg
;;
;; Matches a `store` whose stored value has a type accepted by
;; `ty_32_or_64` and is a `bor` with a sinkable `load` as its first
;; operand; the store's flags, address, and offset must equal the load's.
;; The three ops are lowered to a single `x64_or_mem` with `src2` as the
;; register operand.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(bor (and
(sinkable_load sink)
(load flags addr offset))
src2))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_or_mem ty (to_amode flags addr offset) src2))))
;; Or mem, reg with args swapped
;;
;; Same shape as the non-swapped or rule, but the sinkable `load` is the
;; second operand of the `bor` and `src2` is the first. The store's flags,
;; address, and offset must equal the load's; the result is a single
;; `x64_or_mem`.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(bor src2
(and
(sinkable_load sink)
(load flags addr offset))))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_or_mem ty (to_amode flags addr offset) src2))))
;; Xor mem, reg
;;
;; Matches a `store` whose stored value has a type accepted by
;; `ty_32_or_64` and is a `bxor` with a sinkable `load` as its first
;; operand; the store's flags, address, and offset must equal the load's.
;; The three ops are lowered to a single `x64_xor_mem` with `src2` as the
;; register operand.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(bxor (and
(sinkable_load sink)
(load flags addr offset))
src2))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_xor_mem ty (to_amode flags addr offset) src2))))
;; Xor mem, reg with args swapped
;;
;; Same shape as the non-swapped xor rule, but the sinkable `load` is the
;; second operand of the `bxor` and `src2` is the first. The store's flags,
;; address, and offset must equal the load's; the result is a single
;; `x64_xor_mem`.
;; NOTE(review): the value of `sink_load` is discarded (bound to `_`);
;; presumably it is called only to commit sinking of the load -- confirm.
(rule (lower
(store =flags
(has_type (ty_32_or_64 ty)
(bxor src2
(and
(sinkable_load sink)
(load flags addr offset))))
=addr
=offset))
(let ((_ RegMemImm (sink_load sink)))
(side_effect
(x64_xor_mem ty (to_amode flags addr offset) src2))))

View File

@@ -1,4 +1,4 @@
src/clif.isle 443b34b797fc8ace
src/prelude.isle d8a93eb727abd7f4
src/isa/x64/inst.isle 2fa48b8183f9d5cb
src/isa/x64/lower.isle b7fe1c95c21edbe4
src/isa/x64/inst.isle 6dcba190988a695
src/isa/x64/lower.isle b95161bdf07b9365

File diff suppressed because it is too large Load Diff