Split Fmla and Bsl out into new VecRRRMod op (#4638)
Separates the following opcodes for AArch64 into a separate `VecALUModOp` enum, which is emitted via the `VecRRRMod` instruction. This separates vector ALU instructions which modify a register from instructions which write to a new register: - `Bsl` - `Fmla` Addresses [a discussion](https://github.com/bytecodealliance/wasmtime/pull/4608#discussion_r937975581) in #4608. Copyright (c) 2022 Arm Limited
This commit is contained in:
@@ -576,6 +576,14 @@
|
||||
(rm Reg)
|
||||
(size VectorSize))
|
||||
|
||||
;; A vector ALU op modifying a source register.
|
||||
(VecRRRMod
|
||||
(alu_op VecALUModOp)
|
||||
(rd WritableReg)
|
||||
(rn Reg)
|
||||
(rm Reg)
|
||||
(size VectorSize))
|
||||
|
||||
;; Vector two register miscellaneous instruction.
|
||||
(VecMisc
|
||||
(op VecMisc2)
|
||||
@@ -1108,10 +1116,6 @@
|
||||
(Orr)
|
||||
;; Bitwise exclusive or
|
||||
(Eor)
|
||||
;; Bitwise select
|
||||
;; This opcode should only be used with the `vec_rrr_inplace`
|
||||
;; constructor.
|
||||
(Bsl)
|
||||
;; Unsigned maximum pairwise
|
||||
(Umaxp)
|
||||
;; Add
|
||||
@@ -1146,10 +1150,6 @@
|
||||
(Fmin)
|
||||
;; Floating-point multiply
|
||||
(Fmul)
|
||||
;; Floating-point fused multiply-add vectors
|
||||
;; This opcode should only be used with the `vec_rrr_inplace`
|
||||
;; constructor.
|
||||
(Fmla)
|
||||
;; Add pairwise
|
||||
(Addp)
|
||||
;; Zip vectors (primary) [meaning, high halves]
|
||||
@@ -1158,6 +1158,15 @@
|
||||
(Sqrdmulh)
|
||||
))
|
||||
|
||||
;; A Vector ALU operation which modifies a source register.
|
||||
(type VecALUModOp
|
||||
(enum
|
||||
;; Bitwise select
|
||||
(Bsl)
|
||||
;; Floating-point fused multiply-add vectors
|
||||
(Fmla)
|
||||
))
|
||||
|
||||
;; A Vector miscellaneous operation with two registers.
|
||||
(type VecMisc2
|
||||
(enum
|
||||
@@ -1508,11 +1517,11 @@
|
||||
|
||||
;; Helper for emitting `MInst.VecRRR` instructions which use three registers,
|
||||
;; one of which is both source and output.
|
||||
(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg)
|
||||
(rule (vec_rrr_inplace op src1 src2 src3 size)
|
||||
(decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg)
|
||||
(rule (vec_rrr_mod op src1 src2 src3 size)
|
||||
(let ((dst WritableReg (temp_writable_reg $I8X16))
|
||||
(_1 Unit (emit (MInst.FpuMove128 dst src1)))
|
||||
(_2 Unit (emit (MInst.VecRRR op dst src2 src3 size))))
|
||||
(_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
|
||||
dst))
|
||||
|
||||
;; Helper for emitting `MInst.FpuRRR` instructions.
|
||||
@@ -2198,10 +2207,7 @@
|
||||
|
||||
(decl bsl (Type Reg Reg Reg) Reg)
|
||||
(rule (bsl ty c x y)
|
||||
(let ((dst WritableReg (temp_writable_reg ty))
|
||||
(_ Unit (emit (MInst.FpuMove128 dst c)))
|
||||
(_ Unit (emit (MInst.VecRRR (VecALUOp.Bsl) dst x y (vector_size ty)))))
|
||||
dst))
|
||||
(vec_rrr_mod (VecALUModOp.Bsl) c x y (vector_size ty)))
|
||||
|
||||
;; Helper for generating a `udf` instruction.
|
||||
|
||||
|
||||
@@ -752,6 +752,16 @@ impl VectorSize {
|
||||
|
||||
(q, size)
|
||||
}
|
||||
|
||||
/// Return the encoding bit that is used by some floating-point SIMD
|
||||
/// instructions for a particular operand size.
|
||||
pub fn enc_float_size(&self) -> u32 {
|
||||
match self.lane_size() {
|
||||
ScalarSize::Size32 => 0b0,
|
||||
ScalarSize::Size64 => 0b1,
|
||||
size => panic!("Unsupported floating-point size for vector op: {:?}", size),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {
|
||||
|
||||
@@ -2543,17 +2543,9 @@ impl MachInstEmit for Inst {
|
||||
| VecALUOp::Fdiv
|
||||
| VecALUOp::Fmax
|
||||
| VecALUOp::Fmin
|
||||
| VecALUOp::Fmul
|
||||
| VecALUOp::Fmla => true,
|
||||
| VecALUOp::Fmul => true,
|
||||
_ => false,
|
||||
};
|
||||
let enc_float_size = match (is_float, size) {
|
||||
(true, VectorSize::Size32x2) => 0b0,
|
||||
(true, VectorSize::Size32x4) => 0b0,
|
||||
(true, VectorSize::Size64x2) => 0b1,
|
||||
(true, _) => unimplemented!(),
|
||||
_ => 0,
|
||||
};
|
||||
|
||||
let (top11, bit15_10) = match alu_op {
|
||||
VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
|
||||
@@ -2574,7 +2566,6 @@ impl MachInstEmit for Inst {
|
||||
VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
|
||||
VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
|
||||
VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
|
||||
VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
|
||||
VecALUOp::Umaxp => {
|
||||
debug_assert_ne!(size, VectorSize::Size64x2);
|
||||
|
||||
@@ -2619,7 +2610,6 @@ impl MachInstEmit for Inst {
|
||||
VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
|
||||
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
|
||||
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
|
||||
VecALUOp::Fmla => (0b000_01110_00_1, 0b110011),
|
||||
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
|
||||
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
|
||||
VecALUOp::Sqrdmulh => {
|
||||
@@ -2632,12 +2622,32 @@ impl MachInstEmit for Inst {
|
||||
}
|
||||
};
|
||||
let top11 = if is_float {
|
||||
top11 | enc_float_size << 1
|
||||
top11 | size.enc_float_size() << 1
|
||||
} else {
|
||||
top11
|
||||
};
|
||||
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
|
||||
}
|
||||
&Inst::VecRRRMod {
|
||||
rd,
|
||||
rn,
|
||||
rm,
|
||||
alu_op,
|
||||
size,
|
||||
} => {
|
||||
let rd = allocs.next_writable(rd);
|
||||
let rn = allocs.next(rn);
|
||||
let rm = allocs.next(rm);
|
||||
let (q, _enc_size) = size.enc_size();
|
||||
|
||||
let (top11, bit15_10) = match alu_op {
|
||||
VecALUModOp::Bsl => (0b001_01110_01_1, 0b000111),
|
||||
VecALUModOp::Fmla => {
|
||||
(0b000_01110_00_1 | (size.enc_float_size() << 1), 0b110011)
|
||||
}
|
||||
};
|
||||
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
|
||||
}
|
||||
&Inst::VecLoadReplicate {
|
||||
rd,
|
||||
rn,
|
||||
|
||||
@@ -3383,8 +3383,8 @@ fn test_aarch64_binemit() {
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Bsl,
|
||||
Inst::VecRRRMod {
|
||||
alu_op: VecALUModOp::Bsl,
|
||||
rd: writable_vreg(8),
|
||||
rn: vreg(9),
|
||||
rm: vreg(1),
|
||||
@@ -4055,8 +4055,8 @@ fn test_aarch64_binemit() {
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Fmla,
|
||||
Inst::VecRRRMod {
|
||||
alu_op: VecALUModOp::Fmla,
|
||||
rd: writable_vreg(2),
|
||||
rn: vreg(0),
|
||||
rm: vreg(5),
|
||||
@@ -4067,8 +4067,8 @@ fn test_aarch64_binemit() {
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Fmla,
|
||||
Inst::VecRRRMod {
|
||||
alu_op: VecALUModOp::Fmla,
|
||||
rd: writable_vreg(2),
|
||||
rn: vreg(0),
|
||||
rm: vreg(5),
|
||||
@@ -4079,8 +4079,8 @@ fn test_aarch64_binemit() {
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Fmla,
|
||||
Inst::VecRRRMod {
|
||||
alu_op: VecALUModOp::Fmla,
|
||||
rd: writable_vreg(2),
|
||||
rn: vreg(0),
|
||||
rm: vreg(5),
|
||||
|
||||
@@ -37,9 +37,9 @@ mod emit_tests;
|
||||
|
||||
pub use crate::isa::aarch64::lower::isle::generated_code::{
|
||||
ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
|
||||
FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUOp, VecExtendOp,
|
||||
VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp, VecRRRLongOp,
|
||||
VecShiftImmOp,
|
||||
FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
|
||||
VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
|
||||
VecRRRLongOp, VecShiftImmOp,
|
||||
};
|
||||
|
||||
/// A floating-point unit (FPU) operation with two args, a register and an immediate.
|
||||
@@ -957,14 +957,13 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
|
||||
collector.reg_def(rd);
|
||||
collector.reg_use(rn);
|
||||
}
|
||||
&Inst::VecRRR {
|
||||
alu_op, rd, rn, rm, ..
|
||||
} => {
|
||||
if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla {
|
||||
collector.reg_mod(rd);
|
||||
} else {
|
||||
collector.reg_def(rd);
|
||||
}
|
||||
&Inst::VecRRR { rd, rn, rm, .. } => {
|
||||
collector.reg_def(rd);
|
||||
collector.reg_use(rn);
|
||||
collector.reg_use(rm);
|
||||
}
|
||||
&Inst::VecRRRMod { rd, rn, rm, .. } => {
|
||||
collector.reg_mod(rd);
|
||||
collector.reg_use(rn);
|
||||
collector.reg_use(rm);
|
||||
}
|
||||
@@ -2208,7 +2207,6 @@ impl Inst {
|
||||
VecALUOp::Bic => ("bic", VectorSize::Size8x16),
|
||||
VecALUOp::Orr => ("orr", VectorSize::Size8x16),
|
||||
VecALUOp::Eor => ("eor", VectorSize::Size8x16),
|
||||
VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
|
||||
VecALUOp::Umaxp => ("umaxp", size),
|
||||
VecALUOp::Add => ("add", size),
|
||||
VecALUOp::Sub => ("sub", size),
|
||||
@@ -2226,7 +2224,6 @@ impl Inst {
|
||||
VecALUOp::Fmax => ("fmax", size),
|
||||
VecALUOp::Fmin => ("fmin", size),
|
||||
VecALUOp::Fmul => ("fmul", size),
|
||||
VecALUOp::Fmla => ("fmla", size),
|
||||
VecALUOp::Addp => ("addp", size),
|
||||
VecALUOp::Zip1 => ("zip1", size),
|
||||
VecALUOp::Sqrdmulh => ("sqrdmulh", size),
|
||||
@@ -2236,6 +2233,22 @@ impl Inst {
|
||||
let rm = pretty_print_vreg_vector(rm, size, allocs);
|
||||
format!("{} {}, {}, {}", op, rd, rn, rm)
|
||||
}
|
||||
&Inst::VecRRRMod {
|
||||
rd,
|
||||
rn,
|
||||
rm,
|
||||
alu_op,
|
||||
size,
|
||||
} => {
|
||||
let (op, size) = match alu_op {
|
||||
VecALUModOp::Bsl => ("bsl", VectorSize::Size8x16),
|
||||
VecALUModOp::Fmla => ("fmla", size),
|
||||
};
|
||||
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
|
||||
let rn = pretty_print_vreg_vector(rn, size, allocs);
|
||||
let rm = pretty_print_vreg_vector(rm, size, allocs);
|
||||
format!("{} {}, {}, {}", op, rd, rn, rm)
|
||||
}
|
||||
&Inst::VecRRRLong {
|
||||
rd,
|
||||
rn,
|
||||
|
||||
@@ -380,7 +380,7 @@
|
||||
;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
|
||||
(vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty)))
|
||||
(vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))
|
||||
|
||||
(rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
|
||||
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
|
||||
|
||||
Reference in New Issue
Block a user