CL/aarch64: implement the wasm SIMD i32x4.dot_i16x8_s instruction

This patch implements, for aarch64, the following wasm SIMD extension: the i32x4.dot_i16x8_s instruction (https://github.com/WebAssembly/simd/pull/127). It also updates dependencies as follows, so that the new instruction can be parsed, decoded, etc.: wat to 1.0.27, wast to 26.0.1, wasmparser to 0.65.0, wasmprinter to 0.2.12. The changes are straightforward: * new CLIF instruction `widening_pairwise_dot_product_s` * translation from wasm into `widening_pairwise_dot_product_s` * new AArch64 instructions `smull`, `smull2` (part of the `VecRRR` group) * translation from `widening_pairwise_dot_product_s` to `smull ; smull2 ; addp` There is no testcase in this commit, because the testcases are kept in a separate repo. The implementation has been tested, nevertheless.
2020-10-27 15:04:32 +01:00
parent 54a97f784e
commit 5a5fb11979
26 changed files with 228 additions and 54 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2375,6 +2375,47 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            });
        }

+        Opcode::WideningPairwiseDotProductS => {
+            let r_y = get_output_reg(ctx, outputs[0]);
+            let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+            let ty = ty.unwrap();
+            if ty == I32X4 {
+                let tmp = ctx.alloc_tmp(RegClass::V128, I8X16);
+                // The args have type I16X8. Widen-multiply each half, then pairwise-add.
+                // "y = i32x4.dot_i16x8_s(a, b)"
+                // => smull  tmp, a, b    (products of the low four i16 lanes)
+                //    smull2 y,   a, b    (products of the high four i16 lanes)
+                //    addp   y,   tmp, y  (sum adjacent pairs: tmp's pairs first, then y's)
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Smull,
+                    rd: tmp,
+                    rn: r_a,
+                    rm: r_b,
+                    size: VectorSize::Size16x8,
+                });
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Smull2,
+                    rd: r_y,
+                    rn: r_a,
+                    rm: r_b,
+                    size: VectorSize::Size16x8,
+                });
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Addp,
+                    rd: r_y,
+                    rn: tmp.to_reg(),
+                    rm: r_y.to_reg(),
+                    size: VectorSize::Size32x4,
+                });
+            } else {
+                return Err(CodegenError::Unsupported(format!(
+                    "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
+                    ty
+                )));
+            }
+        }
+
        Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
            let ty = ty.unwrap();
            let bits = ty_bits(ty);