diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index b8bf2ef480..53b0e65eda 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -852,7 +852,7 @@
       (rd WritableReg)
       ;; Offset in range -2^20 .. 2^20.
       (off i32))
-
+
     ;; Compute the address (using a PC-relative offset) of a 4KB page.
     (Adrp
       (rd WritableReg)
@@ -1401,6 +1401,8 @@
   (Bsl)
   ;; Floating-point fused multiply-add vectors
   (Fmla)
+  ;; Floating-point fused multiply-subtract vectors
+  (Fmls)
 ))
 
 ;; A Vector miscellaneous operation with two registers.
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 90a5dbd936..72b1e1176b 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -2906,6 +2906,9 @@ impl MachInstEmit for Inst {
                     VecALUModOp::Fmla => {
                         (0b000_01110_00_1 | (size.enc_float_size() << 1), 0b110011)
                     }
+                    VecALUModOp::Fmls => {
+                        (0b000_01110_10_1 | (size.enc_float_size() << 1), 0b110011)
+                    }
                 };
                 sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
             }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 1826603356..779782907f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -2363,6 +2363,7 @@ impl Inst {
                 let (op, size) = match alu_op {
                     VecALUModOp::Bsl => ("bsl", VectorSize::Size8x16),
                     VecALUModOp::Fmla => ("fmla", size),
+                    VecALUModOp::Fmls => ("fmls", size),
                 };
                 let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
                 let ri = pretty_print_vreg_vector(ri, size, allocs);
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index 3eec28c118..bd7e968d72 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -404,7 +404,13 @@
 (rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
       (vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))
 
-(rule 1 (lower (has_type (ty_scalar_float ty) (fma x y z)))
+(rule 1 (lower (has_type ty @ (multi_lane _ _) (fma (fneg x) y z)))
+      (vec_rrr_mod (VecALUModOp.Fmls) z x y (vector_size ty)))
+
+(rule 2 (lower (has_type ty @ (multi_lane _ _) (fma x (fneg y) z)))
+      (vec_rrr_mod (VecALUModOp.Fmls) z x y (vector_size ty)))
+
+(rule 3 (lower (has_type (ty_scalar_float ty) (fma x y z)))
       (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
 
 ;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cranelift/filetests/filetests/isa/aarch64/fma.clif b/cranelift/filetests/filetests/isa/aarch64/fma.clif
new file mode 100644
index 0000000000..e2f4a172c4
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/fma.clif
@@ -0,0 +1,159 @@
+test compile precise-output
+target aarch64
+
+function %fma_f32(f32, f32, f32) -> f32 {
+block0(v0: f32, v1: f32, v2: f32):
+    v3 = fma v0, v1, v2
+    return v3
+}
+
+; VCode:
+; block0:
+;   fmadd s0, s0, s1, s2
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   fmadd s0, s0, s1, s2
+;   ret
+
+function %fma_f64(f64, f64, f64) -> f64 {
+block0(v0: f64, v1: f64, v2: f64):
+    v3 = fma v0, v1, v2
+    return v3
+}
+
+; VCode:
+; block0:
+;   fmadd d0, d0, d1, d2
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   fmadd d0, d0, d1, d2
+;   ret
+
+function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = fma v0, v1, v2
+    return v3
+}
+
+; VCode:
+; block0:
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmla v0.4s, v0.4s, v5.4s, v1.4s
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmla v0.4s, v5.4s, v1.4s
+;   ret
+
+function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = fma v0, v1, v2
+    return v3
+}
+
+; VCode:
+; block0:
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmla v0.2d, v0.2d, v5.2d, v1.2d
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmla v0.2d, v5.2d, v1.2d
+;   ret
+
+function %fma_neg_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = fneg v0
+    v4 = fma v3, v1, v2
+    return v4
+}
+
+; VCode:
+; block0:
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.4s, v0.4s, v5.4s, v1.4s
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.4s, v5.4s, v1.4s
+;   ret
+
+function %fma_neg_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = fneg v0
+    v4 = fma v3, v1, v2
+    return v4
+}
+
+; VCode:
+; block0:
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.2d, v0.2d, v5.2d, v1.2d
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.2d, v5.2d, v1.2d
+;   ret
+
+function %fma_neg_other_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = fneg v1
+    v4 = fma v0, v3, v2
+    return v4
+}
+
+; VCode:
+; block0:
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.4s, v0.4s, v5.4s, v1.4s
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.4s, v5.4s, v1.4s
+;   ret
+
+function %fma_neg_other_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = fneg v1
+    v4 = fma v0, v3, v2
+    return v4
+}
+
+; VCode:
+; block0:
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.2d, v0.2d, v5.2d, v1.2d
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov v5.16b, v0.16b
+;   mov v0.16b, v2.16b
+;   fmls v0.2d, v5.2d, v1.2d
+;   ret
+
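The new lowering rules rely on the semantics of FMLS: per lane, `fmls vd, vn, vm` computes `vd = vd - vn*vm` with a single rounding step, so `fma (fneg x), y, z == z - x*y` folds into one instruction, and since multiplication commutes the same holds when the `fneg` is on the second multiplicand. A minimal reference sketch of those per-lane semantics in plain Rust, representing a `f32x4` as `[f32; 4]` (the helper name and layout are illustrative, not Cranelift APIs):

```rust
/// Per-lane reference semantics for vector FMLS:
/// `fmls z, x, y` == `z - x*y` == `fma(-x, y, z)` in each lane.
fn fmls_f32x4(z: [f32; 4], x: [f32; 4], y: [f32; 4]) -> [f32; 4] {
    let mut out = [0.0f32; 4];
    for i in 0..4 {
        // `mul_add` is fused (one rounding), matching the AArch64 instruction.
        out[i] = (-x[i]).mul_add(y[i], z[i]);
    }
    out
}

fn main() {
    let z = [1.0, 2.0, 3.0, 4.0];
    let x = [0.5; 4];
    let y = [2.0; 4];
    // z - x*y, lane-wise: [0.0, 1.0, 2.0, 3.0]
    assert_eq!(fmls_f32x4(z, x, y), [0.0, 1.0, 2.0, 3.0]);
}
```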