Split Fmla and Bsl out into new VecRRRMod op (#4638)

Moves the following AArch64 opcodes out of `VecALUOp` into a new `VecALUModOp` enum,
which is emitted via the new `VecRRRMod` instruction. This distinguishes vector ALU
instructions that modify a source register in place from those that write to a new register:
- `Bsl`
- `Fmla`

Addresses [a discussion](https://github.com/bytecodealliance/wasmtime/pull/4608#discussion_r937975581) in #4608.

Copyright (c) 2022 Arm Limited
This commit is contained in:
Damian Heaton
2022-08-08 19:33:13 +01:00
committed by GitHub
parent 866ec46613
commit 47a67d752b
6 changed files with 88 additions and 49 deletions

View File

@@ -576,6 +576,14 @@
(rm Reg) (rm Reg)
(size VectorSize)) (size VectorSize))
;; A vector ALU op modifying a source register.
;; `rd` is reported to the register allocator as a modified operand (it is
;; both read and written), while `rn` and `rm` are plain uses. `alu_op`
;; selects the operation and `size` the vector arrangement.
(VecRRRMod
(alu_op VecALUModOp)
(rd WritableReg)
(rn Reg)
(rm Reg)
(size VectorSize))
;; Vector two register miscellaneous instruction. ;; Vector two register miscellaneous instruction.
(VecMisc (VecMisc
(op VecMisc2) (op VecMisc2)
@@ -1108,10 +1116,6 @@
(Orr) (Orr)
;; Bitwise exclusive or ;; Bitwise exclusive or
(Eor) (Eor)
;; Bitwise select
;; This opcode should only be used with the `vec_rrr_inplace`
;; constructor.
(Bsl)
;; Unsigned maximum pairwise ;; Unsigned maximum pairwise
(Umaxp) (Umaxp)
;; Add ;; Add
@@ -1146,10 +1150,6 @@
(Fmin) (Fmin)
;; Floating-point multiply ;; Floating-point multiply
(Fmul) (Fmul)
;; Floating-point fused multiply-add vectors
;; This opcode should only be used with the `vec_rrr_inplace`
;; constructor.
(Fmla)
;; Add pairwise ;; Add pairwise
(Addp) (Addp)
;; Zip vectors (primary) [meaning, high halves] ;; Zip vectors (primary) [meaning, high halves]
@@ -1158,6 +1158,15 @@
(Sqrdmulh) (Sqrdmulh)
)) ))
;; A Vector ALU operation which modifies a source register.
;; These opcodes are carried by `MInst.VecRRRMod`, whose `rd` operand is
;; collected as a modified register rather than as a fresh definition.
(type VecALUModOp
(enum
;; Bitwise select
(Bsl)
;; Floating-point fused multiply-add vectors
(Fmla)
))
;; A Vector miscellaneous operation with two registers. ;; A Vector miscellaneous operation with two registers.
(type VecMisc2 (type VecMisc2
(enum (enum
@@ -1508,11 +1517,11 @@
;; Helper for emitting `MInst.VecRRR` instructions which use three registers, ;; Helper for emitting `MInst.VecRRR` instructions which use three registers,
;; one of which is both source and output. ;; one of which is both source and output.
(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg) (decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg)
(rule (vec_rrr_inplace op src1 src2 src3 size) (rule (vec_rrr_mod op src1 src2 src3 size)
(let ((dst WritableReg (temp_writable_reg $I8X16)) (let ((dst WritableReg (temp_writable_reg $I8X16))
(_1 Unit (emit (MInst.FpuMove128 dst src1))) (_1 Unit (emit (MInst.FpuMove128 dst src1)))
(_2 Unit (emit (MInst.VecRRR op dst src2 src3 size)))) (_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
dst)) dst))
;; Helper for emitting `MInst.FpuRRR` instructions. ;; Helper for emitting `MInst.FpuRRR` instructions.
@@ -2198,10 +2207,7 @@
(decl bsl (Type Reg Reg Reg) Reg) (decl bsl (Type Reg Reg Reg) Reg)
(rule (bsl ty c x y) (rule (bsl ty c x y)
(let ((dst WritableReg (temp_writable_reg ty)) (vec_rrr_mod (VecALUModOp.Bsl) c x y (vector_size ty)))
(_ Unit (emit (MInst.FpuMove128 dst c)))
(_ Unit (emit (MInst.VecRRR (VecALUOp.Bsl) dst x y (vector_size ty)))))
dst))
;; Helper for generating a `udf` instruction. ;; Helper for generating a `udf` instruction.

View File

@@ -752,6 +752,16 @@ impl VectorSize {
(q, size) (q, size)
} }
/// Encoding bit selecting the operand size for those floating-point SIMD
/// instructions that take one.
///
/// Yields `0` when the lane size is `ScalarSize::Size32` and `1` when it is
/// `ScalarSize::Size64`.
///
/// # Panics
///
/// Panics for any other lane size.
pub fn enc_float_size(&self) -> u32 {
    let lane = self.lane_size();
    match lane {
        ScalarSize::Size32 => 0b0,
        ScalarSize::Size64 => 0b1,
        _ => panic!("Unsupported floating-point size for vector op: {:?}", lane),
    }
}
} }
pub(crate) fn dynamic_to_fixed(ty: Type) -> Type { pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {

View File

@@ -2543,17 +2543,9 @@ impl MachInstEmit for Inst {
| VecALUOp::Fdiv | VecALUOp::Fdiv
| VecALUOp::Fmax | VecALUOp::Fmax
| VecALUOp::Fmin | VecALUOp::Fmin
| VecALUOp::Fmul | VecALUOp::Fmul => true,
| VecALUOp::Fmla => true,
_ => false, _ => false,
}; };
let enc_float_size = match (is_float, size) {
(true, VectorSize::Size32x2) => 0b0,
(true, VectorSize::Size32x4) => 0b0,
(true, VectorSize::Size64x2) => 0b1,
(true, _) => unimplemented!(),
_ => 0,
};
let (top11, bit15_10) = match alu_op { let (top11, bit15_10) = match alu_op {
VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011), VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
@@ -2574,7 +2566,6 @@ impl MachInstEmit for Inst {
VecALUOp::Bic => (0b000_01110_01_1, 0b000111), VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
VecALUOp::Orr => (0b000_01110_10_1, 0b000111), VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
VecALUOp::Eor => (0b001_01110_00_1, 0b000111), VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
VecALUOp::Umaxp => { VecALUOp::Umaxp => {
debug_assert_ne!(size, VectorSize::Size64x2); debug_assert_ne!(size, VectorSize::Size64x2);
@@ -2619,7 +2610,6 @@ impl MachInstEmit for Inst {
VecALUOp::Fmax => (0b000_01110_00_1, 0b111101), VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101), VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111), VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
VecALUOp::Fmla => (0b000_01110_00_1, 0b110011),
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111), VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110), VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
VecALUOp::Sqrdmulh => { VecALUOp::Sqrdmulh => {
@@ -2632,12 +2622,32 @@ impl MachInstEmit for Inst {
} }
}; };
let top11 = if is_float { let top11 = if is_float {
top11 | enc_float_size << 1 top11 | size.enc_float_size() << 1
} else { } else {
top11 top11
}; };
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd)); sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
} }
&Inst::VecRRRMod {
rd,
rn,
rm,
alu_op,
size,
} => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
let rm = allocs.next(rm);
let (q, _enc_size) = size.enc_size();
let (top11, bit15_10) = match alu_op {
VecALUModOp::Bsl => (0b001_01110_01_1, 0b000111),
VecALUModOp::Fmla => {
(0b000_01110_00_1 | (size.enc_float_size() << 1), 0b110011)
}
};
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
&Inst::VecLoadReplicate { &Inst::VecLoadReplicate {
rd, rd,
rn, rn,

View File

@@ -3383,8 +3383,8 @@ fn test_aarch64_binemit() {
)); ));
insns.push(( insns.push((
Inst::VecRRR { Inst::VecRRRMod {
alu_op: VecALUOp::Bsl, alu_op: VecALUModOp::Bsl,
rd: writable_vreg(8), rd: writable_vreg(8),
rn: vreg(9), rn: vreg(9),
rm: vreg(1), rm: vreg(1),
@@ -4055,8 +4055,8 @@ fn test_aarch64_binemit() {
)); ));
insns.push(( insns.push((
Inst::VecRRR { Inst::VecRRRMod {
alu_op: VecALUOp::Fmla, alu_op: VecALUModOp::Fmla,
rd: writable_vreg(2), rd: writable_vreg(2),
rn: vreg(0), rn: vreg(0),
rm: vreg(5), rm: vreg(5),
@@ -4067,8 +4067,8 @@ fn test_aarch64_binemit() {
)); ));
insns.push(( insns.push((
Inst::VecRRR { Inst::VecRRRMod {
alu_op: VecALUOp::Fmla, alu_op: VecALUModOp::Fmla,
rd: writable_vreg(2), rd: writable_vreg(2),
rn: vreg(0), rn: vreg(0),
rm: vreg(5), rm: vreg(5),
@@ -4079,8 +4079,8 @@ fn test_aarch64_binemit() {
)); ));
insns.push(( insns.push((
Inst::VecRRR { Inst::VecRRRMod {
alu_op: VecALUOp::Fmla, alu_op: VecALUModOp::Fmla,
rd: writable_vreg(2), rd: writable_vreg(2),
rn: vreg(0), rn: vreg(0),
rm: vreg(5), rm: vreg(5),

View File

@@ -37,9 +37,9 @@ mod emit_tests;
pub use crate::isa::aarch64::lower::isle::generated_code::{ pub use crate::isa::aarch64::lower::isle::generated_code::{
ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3, ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUOp, VecExtendOp, FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp, VecRRRLongOp, VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
VecShiftImmOp, VecRRRLongOp, VecShiftImmOp,
}; };
/// A floating-point unit (FPU) operation with two args, a register and an immediate. /// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -957,14 +957,13 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
collector.reg_def(rd); collector.reg_def(rd);
collector.reg_use(rn); collector.reg_use(rn);
} }
&Inst::VecRRR { &Inst::VecRRR { rd, rn, rm, .. } => {
alu_op, rd, rn, rm, .. collector.reg_def(rd);
} => { collector.reg_use(rn);
if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla { collector.reg_use(rm);
collector.reg_mod(rd); }
} else { &Inst::VecRRRMod { rd, rn, rm, .. } => {
collector.reg_def(rd); collector.reg_mod(rd);
}
collector.reg_use(rn); collector.reg_use(rn);
collector.reg_use(rm); collector.reg_use(rm);
} }
@@ -2208,7 +2207,6 @@ impl Inst {
VecALUOp::Bic => ("bic", VectorSize::Size8x16), VecALUOp::Bic => ("bic", VectorSize::Size8x16),
VecALUOp::Orr => ("orr", VectorSize::Size8x16), VecALUOp::Orr => ("orr", VectorSize::Size8x16),
VecALUOp::Eor => ("eor", VectorSize::Size8x16), VecALUOp::Eor => ("eor", VectorSize::Size8x16),
VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
VecALUOp::Umaxp => ("umaxp", size), VecALUOp::Umaxp => ("umaxp", size),
VecALUOp::Add => ("add", size), VecALUOp::Add => ("add", size),
VecALUOp::Sub => ("sub", size), VecALUOp::Sub => ("sub", size),
@@ -2226,7 +2224,6 @@ impl Inst {
VecALUOp::Fmax => ("fmax", size), VecALUOp::Fmax => ("fmax", size),
VecALUOp::Fmin => ("fmin", size), VecALUOp::Fmin => ("fmin", size),
VecALUOp::Fmul => ("fmul", size), VecALUOp::Fmul => ("fmul", size),
VecALUOp::Fmla => ("fmla", size),
VecALUOp::Addp => ("addp", size), VecALUOp::Addp => ("addp", size),
VecALUOp::Zip1 => ("zip1", size), VecALUOp::Zip1 => ("zip1", size),
VecALUOp::Sqrdmulh => ("sqrdmulh", size), VecALUOp::Sqrdmulh => ("sqrdmulh", size),
@@ -2236,6 +2233,22 @@ impl Inst {
let rm = pretty_print_vreg_vector(rm, size, allocs); let rm = pretty_print_vreg_vector(rm, size, allocs);
format!("{} {}, {}, {}", op, rd, rn, rm) format!("{} {}, {}, {}", op, rd, rn, rm)
} }
&Inst::VecRRRMod {
rd,
rn,
rm,
alu_op,
size,
} => {
let (op, size) = match alu_op {
VecALUModOp::Bsl => ("bsl", VectorSize::Size8x16),
VecALUModOp::Fmla => ("fmla", size),
};
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
let rn = pretty_print_vreg_vector(rn, size, allocs);
let rm = pretty_print_vreg_vector(rm, size, allocs);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
&Inst::VecRRRLong { &Inst::VecRRRLong {
rd, rd,
rn, rn,

View File

@@ -380,7 +380,7 @@
;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z))) (rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
(vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty))) (vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))
(rule (lower (has_type (ty_scalar_float ty) (fma x y z))) (rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z)) (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))