Use andn for band_not when bmi1 is present (#5701)

We can use the `andn` instruction to lower `band_not` on x64 when the BMI1 extension is available.
Trevor Elliott
2023-02-03 16:23:18 -08:00
committed by GitHub
parent 0ba1448fa4
commit 6d8f2be9e1
7 changed files with 141 additions and 3 deletions
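
As a quick illustration of the operand order (not part of the patch): BMI1's andn computes !src1 & src2, i.e. it negates its first source, while CLIF's band_not x, y computes x & !y, negating its second operand. A minimal Rust model of why the lowering below swaps the operands, with hypothetical helper names:

fn andn(src1: u64, src2: u64) -> u64 {
    // BMI1 `andn dst, src1, src2` negates the first source operand.
    !src1 & src2
}

fn band_not(x: u64, y: u64) -> u64 {
    // CLIF `band_not x, y` negates the second operand.
    x & !y
}

fn main() {
    let (x, y) = (0b1100u64, 0b1010u64);
    // Passing `y` as the negated operand makes the two agree: x & !y == !y & x.
    assert_eq!(band_not(x, y), andn(y, x));
}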


@@ -26,6 +26,15 @@
(src1_dst SyntheticAmode)
(src2 Gpr))
;; Integer arithmetic binary op that relies on the VEX prefix.
;; NOTE: we don't currently support emitting VEX instructions with memory
;; arguments, so `src2` is artificially constrained to be a Gpr.
(AluRmRVex (size OperandSize)
           (op AluRmROpcode)
           (src1 Gpr)
           (src2 Gpr)
           (dst WritableGpr))
;; Instructions on general-purpose registers that only read src and
;; defines dst (dst is not modified). `bsr`, etc.
(UnaryRmR (size OperandSize) ;; 2, 4, or 8
@@ -586,6 +595,9 @@
Xor
Mul))
(type AluRmROpcode extern
      (enum Andn))
(type UnaryRmROpcode extern
(enum Bsr
Bsf
@@ -1837,6 +1849,18 @@
src1
src2))
;; Helper for emitting `MInst.AluRmRVex` instructions.
(decl alu_rm_r_vex (Type AluRmROpcode Gpr Gpr) Gpr)
(rule (alu_rm_r_vex ty opcode src1 src2)
      (let ((dst WritableGpr (temp_writable_gpr))
            (size OperandSize (operand_size_of_type_32_64 ty))
            (_ Unit (emit (MInst.AluRmRVex size opcode src1 src2 dst))))
        dst))

(decl x64_andn (Type Gpr Gpr) Gpr)
(rule (x64_andn ty src1 src2)
      (alu_rm_r_vex ty (AluRmROpcode.Andn) src1 src2))
;; Helper for emitting immediates with an `i64` value. Note that
;; integer constants in ISLE are always parsed as `i128`s; this enables
;; negative numbers to be used as immediates.


@@ -745,7 +745,7 @@ impl PrettyPrint for RegMem {
}
}
-/// Some basic ALU operations. TODO: maybe add Adc, Sbb.
+/// Some basic ALU operations.
#[derive(Copy, Clone, PartialEq)]
pub enum AluRmiROpcode {
/// Add operation.
@@ -788,6 +788,36 @@ impl fmt::Display for AluRmiROpcode {
}
}
/// ALU operations that don't accept immediates.
#[derive(Copy, Clone, PartialEq)]
pub enum AluRmROpcode {
    /// And with negated second operand.
    Andn,
}

impl AluRmROpcode {
    pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
        match self {
            AluRmROpcode::Andn => smallvec![InstructionSet::BMI1],
        }
    }
}

impl fmt::Debug for AluRmROpcode {
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        let name = match self {
            AluRmROpcode::Andn => "andn",
        };
        write!(fmt, "{}", name)
    }
}

impl fmt::Display for AluRmROpcode {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(self, f)
    }
}
#[derive(Clone, PartialEq)]
/// Unary operations requiring register or memory and register operands.
pub enum UnaryRmROpcode {


@@ -283,6 +283,40 @@ pub(crate) fn emit(
);
}
Inst::AluRmRVex {
    size,
    op,
    dst,
    src1,
    src2,
} => {
    use AluRmROpcode::*;

    let dst = allocs.next(dst.to_reg().to_reg());
    let src1 = allocs.next(src1.to_reg());
    let src2 = allocs.next(src2.to_reg());

    let w = match size {
        OperandSize::Size32 => false,
        OperandSize::Size64 => true,
        // the other cases would be rejected by the ISLE constructors
        _ => unreachable!(),
    };

    let opcode = match op {
        Andn => 0xf2,
    };

    VexInstruction::new()
        .map(OpcodeMap::_0F38)
        .w(w)
        .reg(dst.to_real_reg().unwrap().hw_enc())
        .vvvv(src1.to_real_reg().unwrap().hw_enc())
        .rm(src2.to_real_reg().unwrap().hw_enc())
        .opcode(opcode)
        .encode(sink);
}
Inst::UnaryRmR { size, op, src, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
let rex_flags = RexFlags::from(*size);
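
For reference, here is a standalone sketch of the byte sequence that the VexInstruction builder in the AluRmRVex arm above ends up producing for a register-to-register andn. The helper name is hypothetical, and it assumes all registers are in the low eight (no extension bits needed):

fn encode_andn_reg_reg(w: bool, dst: u8, src1: u8, src2: u8) -> [u8; 5] {
    let byte0 = 0xC4; // three-byte VEX escape
    // Inverted R/X/B bits (all ones for low registers) plus the map-select
    // field 0b00010, which names the 0F38 opcode map used above.
    let byte1 = 0b1110_0010;
    // W picks 64-bit (true) vs. 32-bit (false); vvvv carries src1, inverted;
    // the L and pp fields are zero for `andn`.
    let byte2 = ((w as u8) << 7) | ((!src1 & 0xF) << 3);
    let opcode = 0xF2; // the opcode selected for Andn above
    // ModRM byte: mod = 0b11 (register direct), reg = dst, rm = src2.
    let modrm = 0b1100_0000 | ((dst & 0x7) << 3) | (src2 & 0x7);
    [byte0, byte1, byte2, opcode, modrm]
}

fn main() {
    // 32-bit `andn eax, ebx, ecx` encodes as C4 E2 60 F2 C1.
    assert_eq!(
        encode_andn_reg_reg(false, 0, 3, 1),
        [0xC4, 0xE2, 0x60, 0xF2, 0xC1]
    );
}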


@@ -124,6 +124,7 @@ impl Inst {
| Inst::Unwind { .. }
| Inst::DummyUse { .. } => smallvec![],
Inst::AluRmRVex { op, .. } => op.available_from(),
Inst::UnaryRmR { op, .. } => op.available_from(),
// These use dynamic SSE opcodes.
@@ -747,6 +748,25 @@ impl PrettyPrint for Inst {
src1_dst,
)
}
Inst::AluRmRVex {
    size,
    op,
    src1,
    src2,
    dst,
} => {
    let size_bytes = size.to_bytes();
    let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
    let src1 = pretty_print_reg(src1.to_reg(), size_bytes, allocs);
    let src2 = pretty_print_reg(src2.to_reg(), size_bytes, allocs);

    format!(
        "{} {}, {}, {}",
        ljustify2(op.to_string(), String::new()),
        dst,
        src1,
        src2,
    )
}
Inst::UnaryRmR { src, dst, op, size } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
let src = src.pretty_print(size.to_bytes(), allocs);
@@ -1754,6 +1774,13 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_use(src2.to_reg());
src1_dst.get_operands(collector);
}
Inst::AluRmRVex {
    src1, src2, dst, ..
} => {
    collector.reg_def(dst.to_writable_reg());
    collector.reg_use(src1.to_reg());
    collector.reg_use(src2.to_reg());
}
Inst::Not { src, dst, .. } => {
collector.reg_use(src.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);


@@ -1103,12 +1103,17 @@
(sse_and_not ty y x))
-(rule 1 (lower (has_type ty (band_not x y)))
+(rule 1 (lower (has_type ty @ (use_bmi1 $false) (band_not x y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_and ty
               x
               (x64_not ty y)))

(rule 1 (lower (has_type ty @ (use_bmi1 $true) (band_not x y)))
      (if (ty_int_ref_scalar_64 ty))
      ;; the first argument is the one that gets inverted by andn
      (x64_andn ty y x))
;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


@@ -0,0 +1,17 @@
test compile precise-output
target x86_64 has_bmi1
function %f1(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = band_not v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; andn %eax, %esi, %edi
; movq %rbp, %rsp
; popq %rbp
; ret


@@ -1,6 +1,7 @@
test interpret
test run
target x86_64
target x86_64 has_bmi1
target aarch64
target s390x