CL/aarch64 back end: implement the wasm SIMD bitmask instructions

The `bitmask.{8x16,16x8,32x4}` instructions do not map neatly to any single
AArch64 SIMD instruction; instead, each requires a sequence of around ten
instructions.  Because of this, the patch is somewhat longer and more complex
than it would be for (e.g.) x64.
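
For reference, `bitmask.8x16` takes the most significant (sign) bit of each
of the 16 byte lanes and packs those bits into the low 16 bits of a scalar
integer result; the 16x8 and 32x4 variants do the same over fewer, wider
lanes.  A scalar model of the 8x16 semantics (purely illustrative, not part
of this patch):

    // For example, bitmask_8x16([-1; 16]) == 0xFFFF and bitmask_8x16([0; 16]) == 0.
    fn bitmask_8x16(lanes: [i8; 16]) -> u32 {
        let mut mask = 0u32;
        for (i, lane) in lanes.iter().enumerate() {
            if *lane < 0 {
                // This lane's top (sign) bit is set: record it as bit i.
                mask |= 1 << i;
            }
        }
        mask
    }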

Main changes are:

* the relevant testsuite test (`simd_boolean.wast`) has been enabled on aarch64.

* at the CLIF level, add a new instruction `vhigh_bits`, into which these wasm
  instructions are to be translated.

* in the wasm->CLIF translation (code_translator.rs), translate into
  `vhigh_bits`.  This is straightforward; a sketch is shown after this list.

* in the CLIF->AArch64 translation (lower_inst.rs), translate `vhigh_bits`
  into equivalent sequences of AArch64 instructions.  There is a different
  sequence for each of the `{8x16, 16x8, 32x4}` variants.
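
As an illustrative sketch of the wasm->CLIF translation step (assuming the
`pop1_with_bitcast` and `type_of` helpers used by the neighbouring SIMD cases
in code_translator.rs):

    Operator::I8x16Bitmask | Operator::I16x8Bitmask | Operator::I32x4Bitmask => {
        // Pop the vector operand, bitcast to the appropriate lane shape, and
        // push the packed-bits scalar produced by the new CLIF instruction.
        let vector = pop1_with_bitcast(state, type_of(op), builder);
        state.push1(builder.ins().vhigh_bits(I32, vector));
    }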

All other changes are AArch64-specific, and add instruction definitions needed
by the previous step:

* Add two new families of AArch64 instructions: `VecShiftImm` (vector shift by
  immediate) and `VecExtract` (effectively a double-length vector shift)

* To the existing AArch64 family `VecRRR`, add a `zip1` variant.  To the
  `VecLanesOp` family add an `addv` variant.

* Add supporting code for the above changes to AArch64 instructions:
  - getting the register uses (`aarch64_get_regs`)
  - mapping the registers (`aarch64_map_regs`)
  - printing instructions
  - emitting instructions (`impl MachInstEmit for Inst`).  The handling of
    `VecShiftImm` is a bit complex; see the encoding sketch after this list.
  - emission tests for new instructions and variants.
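
On the `VecShiftImm` emission complexity: the AArch64 SIMD shift-by-immediate
encodings fold both the lane size and the shift amount into a single 7-bit
immh:immb field, and right shifts encode differently from left shifts.  A
self-contained sketch of that calculation (an illustration of the architected
encoding, not code copied from the patch):

    /// Compute the 7-bit immh:immb field for a SIMD shift-by-immediate.
    /// `esize` is the lane width in bits (8, 16, 32 or 64).
    fn shift_imm_enc(is_right_shift: bool, esize: u32, shift: u32) -> u32 {
        if is_right_shift {
            // Right shifts (e.g. sshr) allow 1..=esize, encoded as 2*esize - shift.
            assert!(1 <= shift && shift <= esize);
            2 * esize - shift
        } else {
            // Left shifts (shl) allow 0..esize, encoded as esize + shift.
            assert!(shift < esize);
            esize + shift
        }
    }

For example, `sshr v1.16b, v0.16b, #7` has immh:immb = 2*8 - 7 = 9
(0b0001001); the leading 0001 in immh is also what selects the 8-bit lane
size.
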
Author:    Julian Seward
Date:      2020-10-22 16:02:46 +02:00
Committer: julian-seward1
Parent:    b10e027fef
Commit:    2702942050
8 changed files with 570 additions and 5 deletions

@@ -2060,6 +2060,197 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            normalize_bool_result(ctx, insn, rd);
        }
        Opcode::VhighBits => {
            let dst_r = get_output_reg(ctx, outputs[0]);
            let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            // All three sequences use one integer temporary and two vector temporaries. The
            // shift is done early so as to give the register allocator the possibility of using
            // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
            // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
            // derivation of these sequences. Alternative sequences are discussed in
            // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
            // used here.
            // Also .. FIXME: when https://github.com/bytecodealliance/wasmtime/pull/2310 is
            // merged, use `lower_splat_constant` instead to generate the constants.
            let tmp_r0 = ctx.alloc_tmp(RegClass::I64, I64);
            let tmp_v0 = ctx.alloc_tmp(RegClass::V128, I8X16);
            let tmp_v1 = ctx.alloc_tmp(RegClass::V128, I8X16);
            match ty {
                I8X16 => {
                    // sshr tmp_v1.16b, src_v.16b, #7
                    // mov tmp_r0, #0x0201
                    // movk tmp_r0, #0x0804, lsl 16
                    // movk tmp_r0, #0x2010, lsl 32
                    // movk tmp_r0, #0x8040, lsl 48
                    // dup tmp_v0.2d, tmp_r0
                    // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
                    // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
                    // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0h, tmp_v0.8h
                    // mov dst_r, tmp_v0.h[0]
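                    // After the sshr, each byte of tmp_v1 is all-ones or all-zeroes
                    // according to the corresponding input lane's sign bit, so the and
                    // leaves 2^i in byte i of each 8-byte half.  The ext/zip1 pair then
                    // interleaves the two halves into 16-bit lanes (lane j holds bits j
                    // and j+8 of the result), and addv sums the lanes into the complete
                    // 16-bit mask.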
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size8x16,
                        imm: 7,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x8040201008040201u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v1,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecExtract {
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v1.to_reg(),
                        imm4: 8,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Zip1,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size16x8,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size16x8,
                    });
                }
                I16X8 => {
                    // sshr tmp_v1.8h, src_v.8h, #15
                    // mov tmp_r0, #0x1
                    // movk tmp_r0, #0x2, lsl 16
                    // movk tmp_r0, #0x4, lsl 32
                    // movk tmp_r0, #0x8, lsl 48
                    // dup tmp_v0.2d, tmp_r0
                    // lsl tmp_r0, tmp_r0, #4
                    // mov tmp_v0.d[1], tmp_r0
                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0h, tmp_v0.8h
                    // mov dst_r, tmp_v0.h[0]
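                    // After the sshr, each 16-bit lane of tmp_v1 is all-ones or
                    // all-zeroes.  tmp_v0 gets {1,2,4,8} in its low four lanes; shifting
                    // the scalar left by 4 and inserting it as d[1] puts {16,32,64,128}
                    // in the high four.  The and then leaves 2^i in lane i, and addv
                    // folds the lanes into the 8-bit mask.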
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size16x8,
                        imm: 15,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsl64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift { imm: 4 },
                    });
                    ctx.emit(Inst::MovToVec {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size16x8,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size16x8,
                    });
                }
                I32X4 => {
                    // sshr tmp_v1.4s, src_v.4s, #31
                    // mov tmp_r0, #0x1
                    // movk tmp_r0, #0x2, lsl 32
                    // dup tmp_v0.2d, tmp_r0
                    // lsl tmp_r0, tmp_r0, #2
                    // mov tmp_v0.d[1], tmp_r0
                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0s, tmp_v0.4s
                    // mov dst_r, tmp_v0.s[0]
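                    // Same idea as above: tmp_v0 holds {1,2} in its low two 32-bit lanes
                    // and, once the shifted scalar is inserted as d[1], {4,8} in the high
                    // two.  The and leaves 2^i in lane i and addv produces the 4-bit mask.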
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size32x4,
                        imm: 31,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsl64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift { imm: 2 },
                    });
                    ctx.emit(Inst::MovToVec {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size32x4,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size32x4,
                    });
                }
                _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
            }
        }
        Opcode::Shuffle => {
            let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
            let rd = get_output_reg(ctx, outputs[0]);