Merge pull request #2589 from akirilov-arm/popcnt
Cranelift AArch64: Improve the Popcnt implementation
@@ -601,6 +601,14 @@ impl ScalarSize {
         }
     }
 
+    /// Convert from an integer operand size.
+    pub fn from_operand_size(size: OperandSize) -> ScalarSize {
+        match size {
+            OperandSize::Size32 => ScalarSize::Size32,
+            OperandSize::Size64 => ScalarSize::Size64,
+        }
+    }
+
     /// Convert from a type into the smallest size that fits.
     pub fn from_ty(ty: Type) -> ScalarSize {
         Self::from_bits(ty_bits(ty))
@@ -1463,12 +1463,18 @@ impl MachInstEmit for Inst {
                         debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
                         (0b0, 0b11000, enc_size | 0b10)
                     }
+                    VecMisc2::Cnt => {
+                        debug_assert!(size == VectorSize::Size8x8 || size == VectorSize::Size8x16);
+                        (0b0, 0b00101, enc_size)
+                    }
                 };
                 sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
             }
             &Inst::VecLanes { op, rd, rn, size } => {
                 let (q, size) = match size {
+                    VectorSize::Size8x8 => (0b0, 0b00),
                     VectorSize::Size8x16 => (0b1, 0b00),
+                    VectorSize::Size16x4 => (0b0, 0b01),
                     VectorSize::Size16x8 => (0b1, 0b01),
                     VectorSize::Size32x4 => (0b1, 0b10),
                     _ => unreachable!(),
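The new `VecMisc2::Cnt` arm feeds `U = 0b0` and opcode bits `0b00101` into the two-register vector miscellaneous encoder. As a cross-check, the standalone sketch below reproduces the `cnt v23.8b, v5.8b` test vector added in the emission tests further down. Note that `enc_vec_rr_misc_sketch` is a hypothetical helper written for illustration, not Cranelift's actual `enc_vec_rr_misc`, and the field layout is an assumption based on the Arm architecture manual:

```rust
// Assumed AArch64 two-register vector misc layout (for illustration only):
//   0 | Q | U | 01110 | size | 10000 | opcode | 10 | Rn | Rd
fn enc_vec_rr_misc_sketch(q: u32, u: u32, size: u32, opcode: u32, rn: u32, rd: u32) -> u32 {
    0b0_0_0_01110_00_10000_00000_10_00000_00000
        | (q << 30)
        | (u << 29)
        | (size << 22)
        | (opcode << 12)
        | (rn << 5)
        | rd
}

fn main() {
    // cnt v23.8b, v5.8b: Q=0 (64-bit vector), U=0, size=0b00, opcode=0b00101.
    let insn = enc_vec_rr_misc_sketch(0, 0, 0b00, 0b00101, 5, 23);
    assert_eq!(insn, 0x0E2058B7);
    // The little-endian byte order matches the "B758200E" string in the new test.
    assert_eq!(insn.to_le_bytes(), [0xB7, 0x58, 0x20, 0x0E]);
}
```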
@@ -3792,6 +3792,28 @@ fn test_aarch64_binemit() {
         "frintp v12.2d, v17.2d",
     ));
 
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Cnt,
+            rd: writable_vreg(23),
+            rn: vreg(5),
+            size: VectorSize::Size8x8,
+        },
+        "B758200E",
+        "cnt v23.8b, v5.8b",
+    ));
+
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Uminv,
+            rd: writable_vreg(0),
+            rn: vreg(31),
+            size: VectorSize::Size8x8,
+        },
+        "E0AB312E",
+        "uminv b0, v31.8b",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,
@@ -3836,6 +3858,17 @@ fn test_aarch64_binemit() {
         "addv b2, v29.16b",
     ));
 
+    insns.push((
+        Inst::VecLanes {
+            op: VecLanesOp::Addv,
+            rd: writable_vreg(15),
+            rn: vreg(7),
+            size: VectorSize::Size16x4,
+        },
+        "EFB8710E",
+        "addv h15, v7.4h",
+    ));
+
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Addv,
@@ -331,6 +331,8 @@ pub enum VecMisc2 {
     Frintm,
     /// Floating point round to integral, rounding towards plus infinity
     Frintp,
+    /// Population count per byte
+    Cnt,
 }
 
 /// A Vector narrowing operation with two registers.
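`Cnt` corresponds to the AArch64 `CNT` instruction, which replaces every byte lane with the count of set bits in that byte (0 through 8). A minimal scalar model in plain Rust (`cnt_8b` is an invented name, for illustration only):

```rust
// Scalar model of "cnt v.8b": each byte lane becomes its own population count.
fn cnt_8b(lanes: [u8; 8]) -> [u8; 8] {
    lanes.map(|b| b.count_ones() as u8)
}

fn main() {
    assert_eq!(
        cnt_8b([0xFF, 0x0F, 0x01, 0x00, 0, 0, 0, 0]),
        [8, 4, 1, 0, 0, 0, 0, 0]
    );
}
```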
@@ -3752,6 +3754,7 @@ impl Inst {
                 VecMisc2::Frintz => ("frintz", size),
                 VecMisc2::Frintm => ("frintm", size),
                 VecMisc2::Frintp => ("frintp", size),
+                VecMisc2::Cnt => ("cnt", size),
             };
 
             let rd_size = if is_shll { size.widen() } else { size };
@@ -962,143 +962,57 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Popcnt => {
-            // Lower popcount using the following algorithm:
-            //
-            // x -= (x >> 1) & 0x5555555555555555
-            // x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)
-            // x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f
-            // x += x << 8
-            // x += x << 16
-            // x += x << 32
-            // x >> 56
-            let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits,
-            // and fix the sequence below to work properly for this.
-            let narrow_mode = NarrowValueMode::ZeroExtend64;
-            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
-            let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let ty = ty.unwrap();
+            let size = ScalarSize::from_operand_size(OperandSize::from_ty(ty));
+            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
 
-            // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
-            // the rest of the code is identical to the 64-bit version.
-            // lsr [wx]d, [wx]n, #1
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64),
-                rd: rd,
-                rn: rn,
-                immshift: ImmShift::maybe_from_u64(1).unwrap(),
-            });
-
-            // and xd, xd, #0x5555555555555555
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: rd,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x5555555555555555, I64).unwrap(),
-            });
-
-            // sub xd, xn, xd
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Sub64,
-                rd: rd,
-                rn: rn,
-                rm: rd.to_reg(),
-            });
-
-            // and xt, xd, #0x3333333333333333
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: tmp,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
-            });
-
-            // lsr xd, xd, #2
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: ALUOp::Lsr64,
-                rd: rd,
-                rn: rd.to_reg(),
-                immshift: ImmShift::maybe_from_u64(2).unwrap(),
-            });
-
-            // and xd, xd, #0x3333333333333333
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: rd,
-                rn: rd.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(),
-            });
-
-            // add xt, xd, xt
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: rd.to_reg(),
-                rm: tmp.to_reg(),
-            });
-
-            // add xt, xt, xt LSR #4
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSR,
-                    ShiftOpShiftImm::maybe_from_shift(4).unwrap(),
-                ),
-            });
-
-            // and xt, xt, #0x0f0f0f0f0f0f0f0f
-            ctx.emit(Inst::AluRRImmLogic {
-                alu_op: ALUOp::And64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                imml: ImmLogic::maybe_from_u64(0x0f0f0f0f0f0f0f0f, I64).unwrap(),
-            });
-
-            // add xt, xt, xt, LSL #8
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(8).unwrap(),
-                ),
-            });
-
-            // add xt, xt, xt, LSL #16
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(16).unwrap(),
-                ),
-            });
-
-            // add xt, xt, xt, LSL #32
-            ctx.emit(Inst::AluRRRShift {
-                alu_op: ALUOp::Add64,
-                rd: tmp,
-                rn: tmp.to_reg(),
-                rm: tmp.to_reg(),
-                shiftop: ShiftOpAndAmt::new(
-                    ShiftOp::LSL,
-                    ShiftOpShiftImm::maybe_from_shift(32).unwrap(),
-                ),
-            });
-
-            // lsr xd, xt, #56
-            ctx.emit(Inst::AluRRImmShift {
-                alu_op: ALUOp::Lsr64,
-                rd: rd,
-                rn: tmp.to_reg(),
-                immshift: ImmShift::maybe_from_u64(56).unwrap(),
+            // fmov tmp, rn
+            // cnt tmp.8b, tmp.8b
+            // addp tmp.8b, tmp.8b, tmp.8b / addv tmp, tmp.8b / (no instruction for 8-bit inputs)
+            // umov rd, tmp.b[0]
+
+            ctx.emit(Inst::MovToFpu {
+                rd: tmp,
+                rn: rn,
+                size,
+            });
+            ctx.emit(Inst::VecMisc {
+                op: VecMisc2::Cnt,
+                rd: tmp,
+                rn: tmp.to_reg(),
+                size: VectorSize::Size8x8,
+            });
+
+            match ScalarSize::from_ty(ty) {
+                ScalarSize::Size8 => {}
+                ScalarSize::Size16 => {
+                    // ADDP is usually cheaper than ADDV.
+                    ctx.emit(Inst::VecRRR {
+                        alu_op: VecALUOp::Addp,
+                        rd: tmp,
+                        rn: tmp.to_reg(),
+                        rm: tmp.to_reg(),
+                        size: VectorSize::Size8x8,
+                    });
+                }
+                ScalarSize::Size32 | ScalarSize::Size64 => {
+                    ctx.emit(Inst::VecLanes {
+                        op: VecLanesOp::Addv,
+                        rd: tmp,
+                        rn: tmp.to_reg(),
+                        size: VectorSize::Size8x8,
+                    });
+                }
+                sz => panic!("Unexpected scalar FP operand size: {:?}", sz),
+            }
+
+            ctx.emit(Inst::MovFromVec {
+                rd,
+                rn: tmp.to_reg(),
+                idx: 0,
+                size: VectorSize::Size8x16,
             });
         }
 
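To see why the replacement is sound, both lowerings can be modelled in plain Rust (the function names below are invented for illustration; this is not Cranelift code). The removed sequence is the classic SWAR reduction spelled out in the deleted comment; the new one mirrors `fmov`/`cnt`/`addv`/`umov`: move the value into a vector register, count the set bits within each byte, then sum the eight byte counts horizontally:

```rust
// Scalar model of the removed SWAR sequence (64-bit case).
fn popcnt_swar(mut x: u64) -> u64 {
    x -= (x >> 1) & 0x5555555555555555;
    x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
    x = x.wrapping_add(x >> 4) & 0x0f0f0f0f0f0f0f0f;
    // The three shifted adds sum all byte counts into the top byte.
    x = x.wrapping_add(x << 8);
    x = x.wrapping_add(x << 16);
    x = x.wrapping_add(x << 32);
    x >> 56
}

// Scalar model of the new sequence: fmov moves x into a vector register,
// cnt produces per-byte popcounts, addv sums them, umov reads lane 0.
fn popcnt_via_cnt(x: u64) -> u64 {
    x.to_le_bytes().iter().map(|b| b.count_ones() as u64).sum()
}

fn main() {
    for &x in &[0u64, 1, 0xFF, 0x8000000000000001, u64::MAX] {
        assert_eq!(popcnt_swar(x), popcnt_via_cnt(x));
        assert_eq!(popcnt_swar(x), x.count_ones() as u64);
    }
}
```

The byte-count view also explains the size split in the `match` above: an 8-bit input needs no reduction at all (its single byte count is already the answer), a 16-bit input needs only one pairwise add (`addp`, which the comment notes is usually cheaper than `addv`), and 32/64-bit inputs use the full across-lanes `addv`.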
@@ -230,19 +230,10 @@ block0(v0: i64):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: lsr x1, x0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov d0, x0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addv b0, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -255,20 +246,10 @@ block0(v0: i32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: mov w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addv b0, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -281,20 +262,10 @@ block0(v0: i16):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxth w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: addp v0.8b, v0.8b, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -307,20 +278,9 @@ block0(v0: i8):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: uxtb w0, w0
-; nextln: lsr w1, w0, #1
-; nextln: and x1, x1, #6148914691236517205
-; nextln: sub x1, x0, x1
-; nextln: and x0, x1, #3689348814741910323
-; nextln: lsr x1, x1, #2
-; nextln: and x1, x1, #3689348814741910323
-; nextln: add x0, x1, x0
-; nextln: add x0, x0, x0, LSR 4
-; nextln: and x0, x0, #1085102592571150095
-; nextln: add x0, x0, x0, LSL 8
-; nextln: add x0, x0, x0, LSL 16
-; nextln: add x0, x0, x0, LSL 32
-; nextln: lsr x0, x0, #56
+; nextln: fmov s0, w0
+; nextln: cnt v0.8b, v0.8b
+; nextln: umov w0, v0.b[0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret