Split Fmla and Bsl out into new VecRRRMod op (#4638)

Separates the following opcodes for AArch64 into a separate `VecALUModOp` enum,
which is emitted via the `VecRRRMod` instruction. This separates vector ALU
instructions which modify a register from instructions which write to a new register:
- `Bsl`
- `Fmla`

Addresses [a discussion](https://github.com/bytecodealliance/wasmtime/pull/4608#discussion_r937975581) in #4608.

Copyright (c) 2022 Arm Limited
This commit is contained in:
Damian Heaton
2022-08-08 19:33:13 +01:00
committed by GitHub
parent 866ec46613
commit 47a67d752b
6 changed files with 88 additions and 49 deletions

View File

@@ -576,6 +576,14 @@
(rm Reg)
(size VectorSize))
;; A vector ALU op modifying a source register.
(VecRRRMod
(alu_op VecALUModOp)
(rd WritableReg)
(rn Reg)
(rm Reg)
(size VectorSize))
;; Vector two register miscellaneous instruction.
(VecMisc
(op VecMisc2)
@@ -1108,10 +1116,6 @@
(Orr)
;; Bitwise exclusive or
(Eor)
;; Bitwise select
;; This opcode should only be used with the `vec_rrr_inplace`
;; constructor.
(Bsl)
;; Unsigned maximum pairwise
(Umaxp)
;; Add
@@ -1146,10 +1150,6 @@
(Fmin)
;; Floating-point multiply
(Fmul)
;; Floating-point fused multiply-add vectors
;; This opcode should only be used with the `vec_rrr_inplace`
;; constructor.
(Fmla)
;; Add pairwise
(Addp)
;; Zip vectors (primary) [meaning, high halves]
@@ -1158,6 +1158,15 @@
(Sqrdmulh)
))
;; A Vector ALU operation which modifies a source register.
(type VecALUModOp
(enum
;; Bitwise select
(Bsl)
;; Floating-point fused multiply-add vectors
(Fmla)
))
;; A Vector miscellaneous operation with two registers.
(type VecMisc2
(enum
@@ -1508,11 +1517,11 @@
;; Helper for emitting `MInst.VecRRR` instructions which use three registers,
;; one of which is both source and output.
(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg)
(rule (vec_rrr_inplace op src1 src2 src3 size)
(decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg)
(rule (vec_rrr_mod op src1 src2 src3 size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_1 Unit (emit (MInst.FpuMove128 dst src1)))
(_2 Unit (emit (MInst.VecRRR op dst src2 src3 size))))
(_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
dst))
;; Helper for emitting `MInst.FpuRRR` instructions.
@@ -2198,10 +2207,7 @@
(decl bsl (Type Reg Reg Reg) Reg)
(rule (bsl ty c x y)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.FpuMove128 dst c)))
(_ Unit (emit (MInst.VecRRR (VecALUOp.Bsl) dst x y (vector_size ty)))))
dst))
(vec_rrr_mod (VecALUModOp.Bsl) c x y (vector_size ty)))
;; Helper for generating a `udf` instruction.

View File

@@ -752,6 +752,16 @@ impl VectorSize {
(q, size)
}
/// Return the encoding bit that is used by some floating-point SIMD
/// instructions for a particular operand size.
pub fn enc_float_size(&self) -> u32 {
match self.lane_size() {
ScalarSize::Size32 => 0b0,
ScalarSize::Size64 => 0b1,
size => panic!("Unsupported floating-point size for vector op: {:?}", size),
}
}
}
pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {

View File

@@ -2543,17 +2543,9 @@ impl MachInstEmit for Inst {
| VecALUOp::Fdiv
| VecALUOp::Fmax
| VecALUOp::Fmin
| VecALUOp::Fmul
| VecALUOp::Fmla => true,
| VecALUOp::Fmul => true,
_ => false,
};
let enc_float_size = match (is_float, size) {
(true, VectorSize::Size32x2) => 0b0,
(true, VectorSize::Size32x4) => 0b0,
(true, VectorSize::Size64x2) => 0b1,
(true, _) => unimplemented!(),
_ => 0,
};
let (top11, bit15_10) = match alu_op {
VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
@@ -2574,7 +2566,6 @@ impl MachInstEmit for Inst {
VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
VecALUOp::Umaxp => {
debug_assert_ne!(size, VectorSize::Size64x2);
@@ -2619,7 +2610,6 @@ impl MachInstEmit for Inst {
VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
VecALUOp::Fmla => (0b000_01110_00_1, 0b110011),
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
VecALUOp::Sqrdmulh => {
@@ -2632,12 +2622,32 @@ impl MachInstEmit for Inst {
}
};
let top11 = if is_float {
top11 | enc_float_size << 1
top11 | size.enc_float_size() << 1
} else {
top11
};
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
&Inst::VecRRRMod {
rd,
rn,
rm,
alu_op,
size,
} => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
let rm = allocs.next(rm);
let (q, _enc_size) = size.enc_size();
let (top11, bit15_10) = match alu_op {
VecALUModOp::Bsl => (0b001_01110_01_1, 0b000111),
VecALUModOp::Fmla => {
(0b000_01110_00_1 | (size.enc_float_size() << 1), 0b110011)
}
};
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate {
rd,
rn,

View File

@@ -3383,8 +3383,8 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Bsl,
Inst::VecRRRMod {
alu_op: VecALUModOp::Bsl,
rd: writable_vreg(8),
rn: vreg(9),
rm: vreg(1),
@@ -4055,8 +4055,8 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Fmla,
Inst::VecRRRMod {
alu_op: VecALUModOp::Fmla,
rd: writable_vreg(2),
rn: vreg(0),
rm: vreg(5),
@@ -4067,8 +4067,8 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Fmla,
Inst::VecRRRMod {
alu_op: VecALUModOp::Fmla,
rd: writable_vreg(2),
rn: vreg(0),
rm: vreg(5),
@@ -4079,8 +4079,8 @@ fn test_aarch64_binemit() {
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Fmla,
Inst::VecRRRMod {
alu_op: VecALUModOp::Fmla,
rd: writable_vreg(2),
rn: vreg(0),
rm: vreg(5),

View File

@@ -37,9 +37,9 @@ mod emit_tests;
pub use crate::isa::aarch64::lower::isle::generated_code::{
ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUOp, VecExtendOp,
VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp, VecRRRLongOp,
VecShiftImmOp,
FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
VecRRRLongOp, VecShiftImmOp,
};
/// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -957,14 +957,13 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
collector.reg_def(rd);
collector.reg_use(rn);
}
&Inst::VecRRR {
alu_op, rd, rn, rm, ..
} => {
if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla {
collector.reg_mod(rd);
} else {
&Inst::VecRRR { rd, rn, rm, .. } => {
collector.reg_def(rd);
collector.reg_use(rn);
collector.reg_use(rm);
}
&Inst::VecRRRMod { rd, rn, rm, .. } => {
collector.reg_mod(rd);
collector.reg_use(rn);
collector.reg_use(rm);
}
@@ -2208,7 +2207,6 @@ impl Inst {
VecALUOp::Bic => ("bic", VectorSize::Size8x16),
VecALUOp::Orr => ("orr", VectorSize::Size8x16),
VecALUOp::Eor => ("eor", VectorSize::Size8x16),
VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
VecALUOp::Umaxp => ("umaxp", size),
VecALUOp::Add => ("add", size),
VecALUOp::Sub => ("sub", size),
@@ -2226,7 +2224,6 @@ impl Inst {
VecALUOp::Fmax => ("fmax", size),
VecALUOp::Fmin => ("fmin", size),
VecALUOp::Fmul => ("fmul", size),
VecALUOp::Fmla => ("fmla", size),
VecALUOp::Addp => ("addp", size),
VecALUOp::Zip1 => ("zip1", size),
VecALUOp::Sqrdmulh => ("sqrdmulh", size),
@@ -2236,6 +2233,22 @@ impl Inst {
let rm = pretty_print_vreg_vector(rm, size, allocs);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
&Inst::VecRRRMod {
rd,
rn,
rm,
alu_op,
size,
} => {
let (op, size) = match alu_op {
VecALUModOp::Bsl => ("bsl", VectorSize::Size8x16),
VecALUModOp::Fmla => ("fmla", size),
};
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
let rn = pretty_print_vreg_vector(rn, size, allocs);
let rm = pretty_print_vreg_vector(rm, size, allocs);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
&Inst::VecRRRLong {
rd,
rn,

View File

@@ -380,7 +380,7 @@
;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
(vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty)))
(vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))
(rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))