CL/aarch64 back end: implement the wasm SIMD bitmask instructions
The `bitmask.{8x16,16x8,32x4}` instructions do not map neatly onto any single
AArch64 SIMD instruction; instead, each needs a sequence of around ten
instructions. As a result, this patch is somewhat longer and more complex than
it would be for (e.g.) x64.
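For orientation, the semantics being implemented are simple even though the
lowering is not: the result gathers the most-significant bit of each lane into
the low bits of a scalar. A minimal scalar model of the 8x16 case (this
describes the specified behaviour only, not the lowering):

```rust
/// Reference model of wasm `i8x16.bitmask` (CLIF `vhigh_bits` on an i8x16
/// value): bit i of the result is the sign (top) bit of lane i.
fn i8x16_bitmask_ref(lanes: [i8; 16]) -> u32 {
    let mut mask = 0u32;
    for (i, lane) in lanes.iter().enumerate() {
        if *lane < 0 {
            mask |= 1 << i;
        }
    }
    mask
}
```

The 16x8 and 32x4 variants are analogous, producing 8- and 4-bit results
respectively.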
Main changes are:
* the relevant testsuite test (`simd_boolean.wast`) has been enabled on aarch64.
* at the CLIF level, add a new instruction `vhigh_bits`, into which these wasm
instructions are to be translated.
* in the wasm->CLIF translation (code_translator.rs), translate into
`vhigh_bits`. This is straightforward.
* in the CLIF->AArch64 translation (lower_inst.rs), translate `vhigh_bits`
  into equivalent sequences of AArch64 instructions. There is a different
  sequence for each of the `{8x16, 16x8, 32x4}` variants; a small scalar model
  of the 8x16 sequence follows this list.
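To make the 8x16 sequence easier to follow, here is a small, self-contained
scalar model of it (plain Rust on byte arrays rather than NEON registers; the
`sshr`/`and`/`ext`/`zip1`/`addv` steps mirror the instruction sequence emitted
in lower_inst.rs below):

```rust
// Scalar model of the AArch64 sequence used to lower `vhigh_bits` for i8x16.
// Bytes are in little-endian lane order, as in a NEON register.
fn bitmask_8x16_via_neon_steps(src: [i8; 16]) -> u16 {
    // sshr v1.16b, src.16b, #7: each byte becomes 0xFF (lane MSB set) or 0x00.
    let mut v1 = [0u8; 16];
    for i in 0..16 {
        v1[i] = if src[i] < 0 { 0xFF } else { 0x00 };
    }
    // and with 0x8040201008040201 dup'd into both 64-bit halves: lane i keeps
    // only the bit weight 1 << (i % 8).
    for i in 0..16 {
        v1[i] &= 1u8 << (i % 8);
    }
    // ext v0.16b, v1.16b, v1.16b, #8: rotate the 16 bytes down by 8.
    let mut v0 = [0u8; 16];
    for i in 0..16 {
        v0[i] = v1[(i + 8) % 16];
    }
    // zip1 v0.16b, v1.16b, v0.16b: interleave the low 8 bytes of each source,
    // pairing lane i with lane i+8.
    let mut zipped = [0u8; 16];
    for i in 0..8 {
        zipped[2 * i] = v1[i];
        zipped[2 * i + 1] = v0[i];
    }
    // addv h0, v0.8h: horizontal add of the eight 16-bit lanes. Each lane is
    // (bit for lane i) | (bit for lane i+8) << 8, so the sum is the bitmask.
    let mut sum: u16 = 0;
    for i in 0..8 {
        sum = sum.wrapping_add(u16::from_le_bytes([zipped[2 * i], zipped[2 * i + 1]]));
    }
    sum
}

fn main() {
    let v: [i8; 16] = [-1, 0, 0, -3, 0, 0, 0, -128, 0, -1, 0, 0, 0, 0, -2, 0];
    // Lanes 0, 3, 7, 9 and 14 are negative -> bits 0, 3, 7, 9, 14.
    assert_eq!(bitmask_8x16_via_neon_steps(v), 0b0100_0010_1000_1001);
    println!("bitmask = {:#06x}", bitmask_8x16_via_neon_steps(v));
}
```

The key trick is that after the AND step, lane i holds the weight `1 << (i % 8)`
or zero, so interleaving lanes i and i+8 into 16-bit elements and summing them
horizontally yields exactly the 16-bit mask.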
All other changes are AArch64-specific, and add instruction definitions needed
by the previous step:
* Add two new families of AArch64 instructions: `VecShiftImm` (vector shift by
immediate) and `VecExtract` (effectively a double-length vector shift)
* To the existing AArch64 family `VecRRR`, add a `zip1` variant. To the
`VecLanesOp` family add an `addv` variant.
* Add supporting code for the above changes to AArch64 instructions:
- getting the register uses (`aarch64_get_regs`)
- mapping the registers (`aarch64_map_regs`)
- printing instructions
- emitting instructions (`impl MachInstEmit for Inst`). The handling of
  `VecShiftImm` is a bit complex; see the immediate-encoding sketch after this list.
- emission tests for new instructions and variants.
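On the `VecShiftImm` emission: the AArch64 AdvSIMD shift-by-immediate encoding
folds the element size and the shift amount into a single 7-bit `immh:immb`
field, with right shifts encoded as `2*esize - shift` and left shifts as
`esize + shift`, which is what makes the emission fiddly. A standalone sketch
of that calculation follows; this is the architectural encoding rule, not code
quoted from the patch, and the actual emission code may be organised
differently:

```rust
// Compute the 7-bit immh:immb field of an AdvSIMD shift-by-immediate
// instruction. `esize` is the element size in bits (8, 16, 32 or 64).
// Right shifts (sshr/ushr) encode 2*esize - shift; left shifts (shl)
// encode esize + shift. Valid shift ranges: 1..=esize for right shifts,
// 0..=esize-1 for left shifts.
fn shift_imm_bits(esize: u32, shift: u32, is_right_shift: bool) -> u32 {
    if is_right_shift {
        assert!(shift >= 1 && shift <= esize);
        2 * esize - shift
    } else {
        assert!(shift < esize);
        esize + shift
    }
}

fn main() {
    // sshr v1.16b, v0.16b, #7 (used by the 8x16 lowering): esize = 8.
    assert_eq!(shift_imm_bits(8, 7, true), 0b000_1001);
    // sshr v1.8h, v0.8h, #15: esize = 16.
    assert_eq!(shift_imm_bits(16, 15, true), 0b001_0001);
    // sshr v1.4s, v0.4s, #31: esize = 32.
    assert_eq!(shift_imm_bits(32, 31, true), 0b010_0001);
}
```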
committed by julian-seward1 (parent b10e027fef, commit 2702942050)
@@ -2060,6 +2060,197 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            normalize_bool_result(ctx, insn, rd);
        }

        Opcode::VhighBits => {
            let dst_r = get_output_reg(ctx, outputs[0]);
            let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let ty = ctx.input_ty(insn, 0);
            // All three sequences use one integer temporary and two vector temporaries. The
            // shift is done early so as to give the register allocator the possibility of using
            // the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
            // `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
            // derivation of these sequences. Alternative sequences are discussed in
            // https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
            // used here.
            // Also .. FIXME: when https://github.com/bytecodealliance/wasmtime/pull/2310 is
            // merged, use `lower_splat_constant` instead to generate the constants.
            let tmp_r0 = ctx.alloc_tmp(RegClass::I64, I64);
            let tmp_v0 = ctx.alloc_tmp(RegClass::V128, I8X16);
            let tmp_v1 = ctx.alloc_tmp(RegClass::V128, I8X16);
            match ty {
                I8X16 => {
                    // sshr tmp_v1.16b, src_v.16b, #7
                    // mov tmp_r0, #0x0201
                    // movk tmp_r0, #0x0804, lsl 16
                    // movk tmp_r0, #0x2010, lsl 32
                    // movk tmp_r0, #0x8040, lsl 48
                    // dup tmp_v0.2d, tmp_r0
                    // and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
                    // ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
                    // zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0h, tmp_v0.8h
                    // mov dst_r, tmp_v0.h[0]
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size8x16,
                        imm: 7,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x8040201008040201u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v1,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecExtract {
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v1.to_reg(),
                        imm4: 8,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::Zip1,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size16x8,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size16x8,
                    });
                }
                I16X8 => {
                    // sshr tmp_v1.8h, src_v.8h, #15
                    // mov tmp_r0, #0x1
                    // movk tmp_r0, #0x2, lsl 16
                    // movk tmp_r0, #0x4, lsl 32
                    // movk tmp_r0, #0x8, lsl 48
                    // dup tmp_v0.2d, tmp_r0
                    // shl tmp_r0, tmp_r0, #4
                    // mov tmp_v0.d[1], tmp_r0
                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0h, tmp_v0.8h
                    // mov dst_r, tmp_v0.h[0]
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size16x8,
                        imm: 15,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsl64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift { imm: 4 },
                    });
                    ctx.emit(Inst::MovToVec {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size16x8,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size16x8,
                    });
                }
                I32X4 => {
                    // sshr tmp_v1.4s, src_v.4s, #31
                    // mov tmp_r0, #0x1
                    // movk tmp_r0, #0x2, lsl 32
                    // dup tmp_v0.2d, tmp_r0
                    // shl tmp_r0, tmp_r0, #2
                    // mov tmp_v0.d[1], tmp_r0
                    // and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
                    // addv tmp_v0s, tmp_v0.4s
                    // mov dst_r, tmp_v0.s[0]
                    ctx.emit(Inst::VecShiftImm {
                        op: VecShiftImmOp::Sshr,
                        rd: tmp_v1,
                        rn: src_v,
                        size: VectorSize::Size32x4,
                        imm: 31,
                    });
                    lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
                    ctx.emit(Inst::VecDup {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::AluRRImmShift {
                        alu_op: ALUOp::Lsl64,
                        rd: tmp_r0,
                        rn: tmp_r0.to_reg(),
                        immshift: ImmShift { imm: 2 },
                    });
                    ctx.emit(Inst::MovToVec {
                        rd: tmp_v0,
                        rn: tmp_r0.to_reg(),
                        idx: 1,
                        size: VectorSize::Size64x2,
                    });
                    ctx.emit(Inst::VecRRR {
                        alu_op: VecALUOp::And,
                        rd: tmp_v0,
                        rn: tmp_v1.to_reg(),
                        rm: tmp_v0.to_reg(),
                        size: VectorSize::Size8x16,
                    });
                    ctx.emit(Inst::VecLanes {
                        op: VecLanesOp::Addv,
                        rd: tmp_v0,
                        rn: tmp_v0.to_reg(),
                        size: VectorSize::Size32x4,
                    });
                    ctx.emit(Inst::MovFromVec {
                        rd: dst_r,
                        rn: tmp_v0.to_reg(),
                        idx: 0,
                        size: VectorSize::Size32x4,
                    });
                }
                _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
            }
        }

        Opcode::Shuffle => {
            let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
            let rd = get_output_reg(ctx, outputs[0]);