Split Fmla and Bsl out into new VecRRRMod op (#4638)

Separates the following opcodes for AArch64 into a separate `VecALUModOp` enum, which is emitted via the `VecRRRMod` instruction. This separates vector ALU instructions which modify a register from instructions which write to a new register: - `Bsl` - `Fmla` Addresses [a discussion](https://github.com/bytecodealliance/wasmtime/pull/4608#discussion_r937975581) in #4608. Copyright (c) 2022 Arm Limited
2022-08-08 19:33:13 +01:00
parent 866ec46613
commit 47a67d752b
6 changed files with 88 additions and 49 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -576,6 +576,14 @@
        (rm Reg)
        (size VectorSize))

+       ;; A vector ALU op modifying a source register.
+       (VecRRRMod
+        (alu_op VecALUModOp)
+        (rd WritableReg)
+        (rn Reg)
+        (rm Reg)
+        (size VectorSize))
+
       ;; Vector two register miscellaneous instruction.
       (VecMisc
        (op VecMisc2)
@@ -1108,10 +1116,6 @@
    (Orr)
    ;; Bitwise exclusive or
    (Eor)
-    ;; Bitwise select
-    ;; This opcode should only be used with the `vec_rrr_inplace`
-    ;; constructor.
-    (Bsl)
    ;; Unsigned maximum pairwise
    (Umaxp)
    ;; Add
@@ -1146,10 +1150,6 @@
    (Fmin)
    ;; Floating-point multiply
    (Fmul)
-    ;; Floating-point fused multiply-add vectors
-    ;; This opcode should only be used with the `vec_rrr_inplace`
-    ;; constructor.
-    (Fmla)
    ;; Add pairwise
    (Addp)
    ;; Zip vectors (primary) [meaning, high halves]
@@ -1158,6 +1158,15 @@
    (Sqrdmulh)
 ))

+;; A Vector ALU operation which modifies a source register.
+(type VecALUModOp
+  (enum
+    ;; Bitwise select
+    (Bsl)
+    ;; Floating-point fused multiply-add vectors
+    (Fmla)
+))
+
 ;; A Vector miscellaneous operation with two registers.
 (type VecMisc2
  (enum
@@ -1508,11 +1517,11 @@

 ;; Helper for emitting `MInst.VecRRR` instructions which use three registers,
 ;; one of which is both source and output.
-(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg)
-(rule (vec_rrr_inplace op src1 src2 src3 size)
+(decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg)
+(rule (vec_rrr_mod op src1 src2 src3 size)
      (let ((dst WritableReg (temp_writable_reg $I8X16))
            (_1 Unit (emit (MInst.FpuMove128 dst src1)))
-            (_2 Unit (emit (MInst.VecRRR op dst src2 src3 size))))
+            (_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
        dst))

 ;; Helper for emitting `MInst.FpuRRR` instructions.
@@ -2198,10 +2207,7 @@

 (decl bsl (Type Reg Reg Reg) Reg)
 (rule (bsl ty c x y)
-      (let ((dst WritableReg (temp_writable_reg ty))
-            (_ Unit (emit (MInst.FpuMove128 dst c)))
-            (_ Unit (emit (MInst.VecRRR (VecALUOp.Bsl) dst x y (vector_size ty)))))
-        dst))
+      (vec_rrr_mod (VecALUModOp.Bsl) c x y (vector_size ty)))

 ;; Helper for generating a `udf` instruction.

--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -752,6 +752,16 @@ impl VectorSize {

        (q, size)
    }
+
+    /// Return the encoding bit that is used by some floating-point SIMD
+    /// instructions for a particular operand size.
+    pub fn enc_float_size(&self) -> u32 {
+        match self.lane_size() {
+            ScalarSize::Size32 => 0b0,
+            ScalarSize::Size64 => 0b1,
+            size => panic!("Unsupported floating-point size for vector op: {:?}", size),
+        }
+    }
 }

 pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -2543,17 +2543,9 @@ impl MachInstEmit for Inst {
                    | VecALUOp::Fdiv
                    | VecALUOp::Fmax
                    | VecALUOp::Fmin
-                    | VecALUOp::Fmul
-                    | VecALUOp::Fmla => true,
+                    | VecALUOp::Fmul => true,
                    _ => false,
                };
-                let enc_float_size = match (is_float, size) {
-                    (true, VectorSize::Size32x2) => 0b0,
-                    (true, VectorSize::Size32x4) => 0b0,
-                    (true, VectorSize::Size64x2) => 0b1,
-                    (true, _) => unimplemented!(),
-                    _ => 0,
-                };

                let (top11, bit15_10) = match alu_op {
                    VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
@@ -2574,7 +2566,6 @@ impl MachInstEmit for Inst {
                    VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
                    VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
                    VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
-                    VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
                    VecALUOp::Umaxp => {
                        debug_assert_ne!(size, VectorSize::Size64x2);

@@ -2619,7 +2610,6 @@ impl MachInstEmit for Inst {
                    VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
                    VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
                    VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
-                    VecALUOp::Fmla => (0b000_01110_00_1, 0b110011),
                    VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
                    VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
                    VecALUOp::Sqrdmulh => {
@@ -2632,12 +2622,32 @@ impl MachInstEmit for Inst {
                    }
                };
                let top11 = if is_float {
-                    top11 | enc_float_size << 1
+                    top11 | size.enc_float_size() << 1
                } else {
                    top11
                };
                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
            }
+            &Inst::VecRRRMod {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                size,
+            } => {
+                let rd = allocs.next_writable(rd);
+                let rn = allocs.next(rn);
+                let rm = allocs.next(rm);
+                let (q, _enc_size) = size.enc_size();
+
+                let (top11, bit15_10) = match alu_op {
+                    VecALUModOp::Bsl => (0b001_01110_01_1, 0b000111),
+                    VecALUModOp::Fmla => {
+                        (0b000_01110_00_1 | (size.enc_float_size() << 1), 0b110011)
+                    }
+                };
+                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
+            }
            &Inst::VecLoadReplicate {
                rd,
                rn,
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3383,8 +3383,8 @@ fn test_aarch64_binemit() {
    ));

    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Bsl,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Bsl,
            rd: writable_vreg(8),
            rn: vreg(9),
            rm: vreg(1),
@@ -4055,8 +4055,8 @@ fn test_aarch64_binemit() {
    ));

    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Fmla,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Fmla,
            rd: writable_vreg(2),
            rn: vreg(0),
            rm: vreg(5),
@@ -4067,8 +4067,8 @@ fn test_aarch64_binemit() {
    ));

    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Fmla,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Fmla,
            rd: writable_vreg(2),
            rn: vreg(0),
            rm: vreg(5),
@@ -4079,8 +4079,8 @@ fn test_aarch64_binemit() {
    ));

    insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Fmla,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Fmla,
            rd: writable_vreg(2),
            rn: vreg(0),
            rm: vreg(5),
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -37,9 +37,9 @@ mod emit_tests;

 pub use crate::isa::aarch64::lower::isle::generated_code::{
    ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
-    FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUOp, VecExtendOp,
-    VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp, VecRRRLongOp,
-    VecShiftImmOp,
+    FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
+    VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
+    VecRRRLongOp, VecShiftImmOp,
 };

 /// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -957,14 +957,13 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
            collector.reg_def(rd);
            collector.reg_use(rn);
        }
-        &Inst::VecRRR {
-            alu_op, rd, rn, rm, ..
-        } => {
-            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla {
-                collector.reg_mod(rd);
-            } else {
+        &Inst::VecRRR { rd, rn, rm, .. } => {
            collector.reg_def(rd);
+            collector.reg_use(rn);
+            collector.reg_use(rm);
        }
+        &Inst::VecRRRMod { rd, rn, rm, .. } => {
+            collector.reg_mod(rd);
            collector.reg_use(rn);
            collector.reg_use(rm);
        }
@@ -2208,7 +2207,6 @@ impl Inst {
                    VecALUOp::Bic => ("bic", VectorSize::Size8x16),
                    VecALUOp::Orr => ("orr", VectorSize::Size8x16),
                    VecALUOp::Eor => ("eor", VectorSize::Size8x16),
-                    VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
                    VecALUOp::Umaxp => ("umaxp", size),
                    VecALUOp::Add => ("add", size),
                    VecALUOp::Sub => ("sub", size),
@@ -2226,7 +2224,6 @@ impl Inst {
                    VecALUOp::Fmax => ("fmax", size),
                    VecALUOp::Fmin => ("fmin", size),
                    VecALUOp::Fmul => ("fmul", size),
-                    VecALUOp::Fmla => ("fmla", size),
                    VecALUOp::Addp => ("addp", size),
                    VecALUOp::Zip1 => ("zip1", size),
                    VecALUOp::Sqrdmulh => ("sqrdmulh", size),
@@ -2236,6 +2233,22 @@ impl Inst {
                let rm = pretty_print_vreg_vector(rm, size, allocs);
                format!("{} {}, {}, {}", op, rd, rn, rm)
            }
+            &Inst::VecRRRMod {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                size,
+            } => {
+                let (op, size) = match alu_op {
+                    VecALUModOp::Bsl => ("bsl", VectorSize::Size8x16),
+                    VecALUModOp::Fmla => ("fmla", size),
+                };
+                let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
+                let rn = pretty_print_vreg_vector(rn, size, allocs);
+                let rm = pretty_print_vreg_vector(rm, size, allocs);
+                format!("{} {}, {}, {}", op, rd, rn, rm)
+            }
            &Inst::VecRRRLong {
                rd,
                rn,
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -380,7 +380,7 @@
 ;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
-      (vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty)))
+      (vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))

 (rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
      (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))