From 00ac18c8664ddc610bbeed875fb0c0e75b525516 Mon Sep 17 00:00:00 2001 From: Damian Heaton <87125748+dheaton-arm@users.noreply.github.com> Date: Tue, 19 Jul 2022 20:03:05 +0100 Subject: [PATCH] Convert `fadd`..`fmax_pseudo` to ISLE (AArch64) (#4452) Converted the existing implementations for the following Opcodes to ISLE on AArch64: - `fadd` - `fsub` - `fmul` - `fdiv` - `fmin` - `fmax` - `fmin_pseudo` - `fmax_pseudo` Copyright (c) 2022 Arm Limited --- cranelift/codegen/src/isa/aarch64/inst.isle | 39 +++++ cranelift/codegen/src/isa/aarch64/lower.isle | 66 ++++++++ .../src/isa/aarch64/lower_dynamic_neon.isle | 28 ++++ .../codegen/src/isa/aarch64/lower_inst.rs | 104 +----------- .../isa/aarch64/dynamic-simd-neon.clif | 104 ++++++++++++ .../filetests/isa/aarch64/prologue.clif | 104 ++++++------ .../runtests/dynamic-simd-arithmetic.clif | 156 ++++++++++++++++++ 7 files changed, 447 insertions(+), 154 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index a1553000a8..d19da6bf25 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -895,6 +895,16 @@ Size64 Size128)) +;; Helper for calculating the `ScalarSize` corresponding to a type +(decl scalar_size (Type) ScalarSize) +(rule (scalar_size $I8) (ScalarSize.Size8)) +(rule (scalar_size $I16) (ScalarSize.Size16)) +(rule (scalar_size $I32) (ScalarSize.Size32)) +(rule (scalar_size $I64) (ScalarSize.Size64)) +(rule (scalar_size $I128) (ScalarSize.Size128)) +(rule (scalar_size $F32) (ScalarSize.Size32)) +(rule (scalar_size $F64) (ScalarSize.Size64)) + (type Cond extern (enum (Eq) @@ -1460,6 +1470,19 @@ (_ Unit (emit (MInst.VecRRR op dst src1 src2 size)))) dst)) +;; Helper for emitting `MInst.FpuRRR` instructions. 
+(decl fpu_rrr (FPUOp2 Reg Reg ScalarSize) Reg) +(rule (fpu_rrr op src1 src2 size) + (let ((dst WritableReg (temp_writable_reg $F64)) + (_ Unit (emit (MInst.FpuRRR op size dst src1 src2)))) + dst)) + +;; Helper for emitting `MInst.FpuCmp` instructions. +(decl fpu_cmp (ScalarSize Reg Reg) ProducesFlags) +(rule (fpu_cmp size rn rm) + (ProducesFlags.ProducesFlagsSideEffect + (MInst.FpuCmp size rn rm))) + ;; Helper for emitting `MInst.VecLanes` instructions. (decl vec_lanes (VecLanesOp Reg VectorSize) Reg) (rule (vec_lanes op src size) @@ -1612,6 +1635,22 @@ (_ Unit (emit (MInst.VecRRLong op dst src high_half)))) dst)) +;; Helper for emitting `MInst.FpuCSel32` / `MInst.FpuCSel64` +;; instructions. +(decl fpu_csel (Type Cond Reg Reg) ConsumesFlags) +(rule (fpu_csel $F32 cond if_true if_false) + (let ((dst WritableReg (temp_writable_reg $F32))) + (ConsumesFlags.ConsumesFlagsReturnsReg + (MInst.FpuCSel32 dst if_true if_false cond) + dst))) + +(rule (fpu_csel $F64 cond if_true if_false) + (let ((dst WritableReg (temp_writable_reg $F64))) + (ConsumesFlags.ConsumesFlagsReturnsReg + (MInst.FpuCSel64 dst if_true if_false cond) + dst))) + + ;; Helper for emitting `MInst.MovToFpu` instructions. 
(decl mov_to_fpu (Reg ScalarSize) Reg) (rule (mov_to_fpu x size) diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 772b3ac482..c4332418e6 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -164,6 +164,72 @@ (rule (lower (has_type (fits_in_32 ty) (iabs x))) (abs (OperandSize.Size32) (put_in_reg_sext32 x))) +;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fadd rn rm))) + (vec_rrr (VecALUOp.Fadd) rn rm (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fadd rn rm))) + (fpu_rrr (FPUOp2.Add) rn rm (scalar_size ty))) + +;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fsub rn rm))) + (vec_rrr (VecALUOp.Fsub) rn rm (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fsub rn rm))) + (fpu_rrr (FPUOp2.Sub) rn rm (scalar_size ty))) + +;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fmul rn rm))) + (vec_rrr (VecALUOp.Fmul) rn rm (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fmul rn rm))) + (fpu_rrr (FPUOp2.Mul) rn rm (scalar_size ty))) + +;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fdiv rn rm))) + (vec_rrr (VecALUOp.Fdiv) rn rm (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fdiv rn rm))) + (fpu_rrr (FPUOp2.Div) rn rm (scalar_size ty))) + +;;;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fmin rn rm))) + (vec_rrr (VecALUOp.Fmin) rn rm (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fmin rn rm))) + (fpu_rrr (FPUOp2.Min) rn rm (scalar_size ty))) + +;;;; 
Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fmax rn rm))) + (vec_rrr (VecALUOp.Fmax) rn rm (vector_size ty))) + +(rule (lower (has_type (ty_scalar_float ty) (fmax rn rm))) + (fpu_rrr (FPUOp2.Max) rn rm (scalar_size ty))) + +;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fmin_pseudo rm rn))) + (bsl ty (vec_rrr (VecALUOp.Fcmgt) rm rn (vector_size ty)) rn rm)) + +(rule (lower (has_type (ty_scalar_float ty) (fmin_pseudo rm rn))) + (with_flags (fpu_cmp (scalar_size ty) rm rn) + (fpu_csel ty (Cond.Gt) rn rm))) + +;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane _ _) (fmax_pseudo rm rn))) + (bsl ty (vec_rrr (VecALUOp.Fcmgt) rn rm (vector_size ty)) rn rm)) + +(rule (lower (has_type (ty_scalar_float ty) (fmax_pseudo rm rn))) + (with_flags (fpu_cmp (scalar_size ty) rn rm) + (fpu_csel ty (Cond.Gt) rn rm))) + ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller diff --git a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle index c6f6115af9..12d20b3e3d 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle +++ b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle @@ -19,6 +19,34 @@ (rule (lower (has_type ty @ (dynamic_lane _ _) (fsub x y))) (value_reg (vec_rrr (VecALUOp.Fsub) (put_in_reg x) (put_in_reg y) (vector_size ty)))) +;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type ty @ (dynamic_lane _ _) (fmul x y))) + (value_reg (vec_rrr (VecALUOp.Fmul) (put_in_reg x) (put_in_reg y) (vector_size ty)))) + +;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type ty @ (dynamic_lane _ _) (fdiv x y))) + 
(value_reg (vec_rrr (VecALUOp.Fdiv) (put_in_reg x) (put_in_reg y) (vector_size ty)))) + +;;;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type ty @ (dynamic_lane _ _) (fmin x y))) + (value_reg (vec_rrr (VecALUOp.Fmin) (put_in_reg x) (put_in_reg y) (vector_size ty)))) + +;;;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type ty @ (dynamic_lane _ _) (fmax x y))) + (value_reg (vec_rrr (VecALUOp.Fmax) (put_in_reg x) (put_in_reg y) (vector_size ty)))) + +;;;; Rules for `fmin_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type ty @ (dynamic_lane _ _) (fmin_pseudo x y))) + (value_reg (bsl ty + (vec_rrr (VecALUOp.Fcmgt) (put_in_reg x) (put_in_reg y) + (vector_size ty)) (put_in_reg y) (put_in_reg x)))) + +;;;; Rules for `fmax_pseudo` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(rule (lower (has_type ty @ (dynamic_lane _ _) (fmax_pseudo x y))) + (value_reg (bsl ty + (vec_rrr (VecALUOp.Fcmgt) (put_in_reg y) (put_in_reg x) + (vector_size ty)) (put_in_reg y) (put_in_reg x)))) + ;;; Rules for `dynamic_stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (dynamic_stack_addr stack_slot)) (let ((dst WritableReg (temp_writable_reg $I64)) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 09e4d311c4..f01221cb45 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1244,110 +1244,10 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => { - let ty = ty.unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - if !ty.is_vector() && !ty.is_dynamic_vector() { - 
let fpu_op = match op { - Opcode::Fadd => FPUOp2::Add, - Opcode::Fsub => FPUOp2::Sub, - Opcode::Fmul => FPUOp2::Mul, - Opcode::Fdiv => FPUOp2::Div, - Opcode::Fmin => FPUOp2::Min, - Opcode::Fmax => FPUOp2::Max, - _ => unreachable!(), - }; - ctx.emit(Inst::FpuRRR { - fpu_op, - size: ScalarSize::from_ty(ty), - rd, - rn, - rm, - }); - } else { - let alu_op = match op { - Opcode::Fadd => VecALUOp::Fadd, - Opcode::Fsub => VecALUOp::Fsub, - Opcode::Fdiv => VecALUOp::Fdiv, - Opcode::Fmax => VecALUOp::Fmax, - Opcode::Fmin => VecALUOp::Fmin, - Opcode::Fmul => VecALUOp::Fmul, - _ => unreachable!(), - }; - - ctx.emit(Inst::VecRRR { - rd, - rn, - rm, - alu_op, - size: VectorSize::from_ty(ty), - }); - } + implemented_in_isle(ctx) } - Opcode::FminPseudo | Opcode::FmaxPseudo => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let (ra, rb) = if op == Opcode::FminPseudo { - (rm, rn) - } else { - (rn, rm) - }; - let ty = ty.unwrap(); - let lane_type = ty.lane_type(); - - debug_assert!(lane_type == F32 || lane_type == F64); - - if ty.is_vector() || ty.is_dynamic_vector() { - let size = VectorSize::from_ty(ty); - - // pmin(a,b) => bitsel(b, a, cmpgt(a, b)) - // pmax(a,b) => bitsel(b, a, cmpgt(b, a)) - // Since we're going to write the output register `rd` anyway, we might as well - // first use it to hold the comparison result. This has the slightly unusual - // effect that we modify the output register in the first instruction (`fcmgt`) - // but read both the inputs again in the second instruction (`bsl`), which means - // that the output register can't be either of the input registers. Regalloc - // should handle this correctly, nevertheless. 
- ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Fcmgt, - rd, - rn: ra, - rm: rb, - size, - }); - ctx.emit(Inst::VecRRR { - alu_op: VecALUOp::Bsl, - rd, - rn, - rm, - size, - }); - } else { - ctx.emit(Inst::FpuCmp { - size: ScalarSize::from_ty(lane_type), - rn: ra, - rm: rb, - }); - if lane_type == F32 { - ctx.emit(Inst::FpuCSel32 { - rd, - rn, - rm, - cond: Cond::Gt, - }); - } else { - ctx.emit(Inst::FpuCSel64 { - rd, - rn, - rm, - cond: Cond::Gt, - }); - } - } - } + Opcode::FminPseudo | Opcode::FmaxPseudo => implemented_in_isle(ctx), Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => { let ty = ty.unwrap(); diff --git a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif index 255e19bfde..1f1f64d0cf 100644 --- a/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif +++ b/cranelift/filetests/filetests/isa/aarch64/dynamic-simd-neon.clif @@ -102,3 +102,107 @@ block0(v0: f64, v1: f64): ; nextln: dup v6.2d, v1.d[0] ; nextln: fsub v0.2d, v4.2d, v6.2d ; nextln: ret + +function %f64x2_splat_mul(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmul v2, v3 + v5 = extract_vector v4, 0 + return v5 +} + +; check: dup v4.2d, v0.d[0] +; nextln: dup v6.2d, v1.d[0] +; nextln: fmul v0.2d, v4.2d, v6.2d +; nextln: ret + +function %f64x2_splat_div(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fdiv v2, v3 + v5 = extract_vector v4, 0 + return v5 +} + +; check: dup v4.2d, v0.d[0] +; nextln: dup v6.2d, v1.d[0] +; nextln: fdiv v0.2d, v4.2d, v6.2d +; nextln: ret + +function %f64x2_splat_min(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmin v2, v3 + v5 = 
extract_vector v4, 0 + return v5 +} + +; check: dup v4.2d, v0.d[0] +; nextln: dup v6.2d, v1.d[0] +; nextln: fmin v0.2d, v4.2d, v6.2d +; nextln: ret + +function %f64x2_splat_max(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmax v2, v3 + v5 = extract_vector v4, 0 + return v5 +} + +; check: dup v4.2d, v0.d[0] +; nextln: dup v6.2d, v1.d[0] +; nextln: fmax v0.2d, v4.2d, v6.2d +; nextln: ret + +function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmin_pseudo v2, v3 + v5 = extract_vector v4, 0 + return v5 +} + +; check: dup v4.2d, v0.d[0] +; nextln: dup v6.2d, v1.d[0] +; nextln: fcmgt v0.2d, v4.2d, v6.2d +; nextln: bsl v0.16b, v6.16b, v4.16b +; nextln: ret + +function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmax_pseudo v2, v3 + v5 = extract_vector v4, 0 + return v5 +} + +; check: dup v4.2d, v0.d[0] +; nextln: dup v6.2d, v1.d[0] +; nextln: fcmgt v0.2d, v6.2d, v4.2d +; nextln: bsl v0.16b, v6.16b, v4.16b +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/aarch64/prologue.clif b/cranelift/filetests/filetests/isa/aarch64/prologue.clif index 2451faa03c..9b4a52e4e0 100644 --- a/cranelift/filetests/filetests/isa/aarch64/prologue.clif +++ b/cranelift/filetests/filetests/isa/aarch64/prologue.clif @@ -82,68 +82,68 @@ block0(v0: f64): ; stp d10, d11, [sp, #-16]! ; stp d8, d9, [sp, #-16]! 
; block0: -; fadd d2, d0, d0 -; fadd d4, d0, d0 -; fadd d6, d0, d0 -; fadd d8, d0, d0 -; fadd d10, d0, d0 -; fadd d12, d0, d0 -; fadd d14, d0, d0 ; fadd d1, d0, d0 +; fadd d2, d0, d0 ; fadd d3, d0, d0 +; fadd d4, d0, d0 ; fadd d5, d0, d0 +; fadd d6, d0, d0 ; fadd d7, d0, d0 +; fadd d8, d0, d0 ; fadd d9, d0, d0 +; fadd d10, d0, d0 ; fadd d11, d0, d0 +; fadd d12, d0, d0 ; fadd d13, d0, d0 -; fadd d30, d0, d0 +; fadd d14, d0, d0 ; fadd d15, d0, d0 -; fadd d18, d0, d0 -; fadd d20, d0, d0 -; fadd d22, d0, d0 -; fadd d24, d0, d0 -; fadd d26, d0, d0 -; fadd d28, d0, d0 -; fadd d31, d0, d0 ; fadd d16, d0, d0 -; fadd d19, d0, d0 -; fadd d21, d0, d0 -; fadd d23, d0, d0 -; fadd d25, d0, d0 -; fadd d27, d0, d0 -; fadd d29, d0, d0 ; fadd d17, d0, d0 -; fadd d0, d0, d2 -; fadd d2, d4, d6 -; fadd d4, d8, d10 -; fadd d6, d12, d14 -; fadd d8, d1, d3 -; fadd d10, d5, d7 -; fadd d12, d9, d11 -; fadd d14, d13, d30 -; fadd d1, d15, d18 -; fadd d3, d20, d22 -; fadd d5, d24, d26 -; fadd d7, d28, d31 -; fadd d9, d16, d19 -; fadd d11, d21, d23 -; fadd d13, d25, d27 -; fadd d15, d29, d17 -; fadd d0, d0, d2 -; fadd d2, d4, d6 -; fadd d4, d8, d10 -; fadd d6, d12, d14 -; fadd d8, d1, d3 -; fadd d10, d5, d7 -; fadd d12, d9, d11 -; fadd d14, d13, d15 -; fadd d0, d0, d2 -; fadd d2, d4, d6 -; fadd d4, d8, d10 -; fadd d6, d12, d14 -; fadd d8, d0, d2 -; fadd d10, d4, d6 -; fadd d0, d8, d10 +; fadd d18, d0, d0 +; fadd d19, d0, d0 +; fadd d20, d0, d0 +; fadd d21, d0, d0 +; fadd d22, d0, d0 +; fadd d23, d0, d0 +; fadd d24, d0, d0 +; fadd d25, d0, d0 +; fadd d26, d0, d0 +; fadd d27, d0, d0 +; fadd d28, d0, d0 +; fadd d29, d0, d0 +; fadd d30, d0, d0 +; fadd d31, d0, d0 +; fadd d0, d0, d1 +; fadd d1, d2, d3 +; fadd d2, d4, d5 +; fadd d3, d6, d7 +; fadd d4, d8, d9 +; fadd d5, d10, d11 +; fadd d6, d12, d13 +; fadd d7, d14, d15 +; fadd d8, d16, d17 +; fadd d9, d18, d19 +; fadd d10, d20, d21 +; fadd d11, d22, d23 +; fadd d12, d24, d25 +; fadd d13, d26, d27 +; fadd d14, d28, d29 +; fadd d15, d30, d31 +; fadd 
d0, d0, d1 +; fadd d1, d2, d3 +; fadd d2, d4, d5 +; fadd d3, d6, d7 +; fadd d4, d8, d9 +; fadd d5, d10, d11 +; fadd d6, d12, d13 +; fadd d7, d14, d15 +; fadd d0, d0, d1 +; fadd d1, d2, d3 +; fadd d2, d4, d5 +; fadd d3, d6, d7 +; fadd d0, d0, d1 +; fadd d1, d2, d3 +; fadd d0, d0, d1 ; ldp d8, d9, [sp], #16 ; ldp d10, d11, [sp], #16 ; ldp d12, d13, [sp], #16 diff --git a/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif index de7dcdc79f..0bd30a105d 100644 --- a/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/dynamic-simd-arithmetic.clif @@ -195,3 +195,159 @@ block0(v0: f64, v1: f64): return v5 } ; run: %f64x2_splat_sub(0x1.0, 0x3.0) == [-0x2.0 -0x2.0] + +function %f32x4_splat_mul(f32, f32) -> f32x4 { + gv0 = dyn_scale_target_const.f32x4 + dt0 = f32x4*gv0 + +block0(v0: f32, v1: f32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmul v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f32x4_splat_mul(0x2.0, 0x3.0) == [0x6.0 0x6.0 0x6.0 0x6.0] + +function %f64x2_splat_mul(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmul v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f64x2_splat_mul(-0x2.0, 0x3.0) == [-0x6.0 -0x6.0] + +function %f32x4_splat_div(f32, f32) -> f32x4 { + gv0 = dyn_scale_target_const.f32x4 + dt0 = f32x4*gv0 + +block0(v0: f32, v1: f32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fdiv v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f32x4_splat_div(0x6.6, 0x2.2) == [0x3.0 0x3.0 0x3.0 0x3.0] + +function %f64x2_splat_div(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fdiv v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f64x2_splat_div(-0x6.6, 0x2.2) == 
[-0x3.0 -0x3.0] + +function %f32x4_splat_min(f32, f32) -> f32x4 { + gv0 = dyn_scale_target_const.f32x4 + dt0 = f32x4*gv0 + +block0(v0: f32, v1: f32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmin v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f32x4_splat_min(0x6.6, 0x2.2) == [0x2.2 0x2.2 0x2.2 0x2.2] + +function %f64x2_splat_min(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmin v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f64x2_splat_min(-0x6.6, 0x2.2) == [-0x6.6 -0x6.6] + +function %f32x4_splat_max(f32, f32) -> f32x4 { + gv0 = dyn_scale_target_const.f32x4 + dt0 = f32x4*gv0 + +block0(v0: f32, v1: f32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmax v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f32x4_splat_max(0x6.6, 0x2.2) == [0x6.6 0x6.6 0x6.6 0x6.6] + +function %f64x2_splat_max(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmax v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f64x2_splat_max(-0x6.6, 0x2.2) == [0x2.2 0x2.2] + +function %f32x4_splat_min_pseudo(f32, f32) -> f32x4 { + gv0 = dyn_scale_target_const.f32x4 + dt0 = f32x4*gv0 + +block0(v0: f32, v1: f32): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmin_pseudo v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f32x4_splat_min_pseudo(0x6.6, 0x2.2) == [0x2.2 0x2.2 0x2.2 0x2.2] + +function %f64x2_splat_min_pseudo(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmin_pseudo v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f64x2_splat_min_pseudo(-0x6.6, 0x2.2) == [-0x6.6 -0x6.6] + +function %f32x4_splat_max_pseudo(f32, f32) -> f32x4 { + gv0 = dyn_scale_target_const.f32x4 + dt0 = f32x4*gv0 + +block0(v0: f32, v1: f32): + v2 
= splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmax_pseudo v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f32x4_splat_max_pseudo(0x6.6, 0x2.2) == [0x6.6 0x6.6 0x6.6 0x6.6] + +function %f64x2_splat_max_pseudo(f64, f64) -> f64x2 { + gv0 = dyn_scale_target_const.f64x2 + dt0 = f64x2*gv0 + +block0(v0: f64, v1: f64): + v2 = splat.dt0 v0 + v3 = splat.dt0 v1 + v4 = fmax_pseudo v2, v3 + v5 = extract_vector v4, 0 + return v5 +} +; run: %f64x2_splat_max_pseudo(-0x6.6, 0x2.2) == [0x2.2 0x2.2]