CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions.

This patch implements, for aarch64, the following wasm SIMD extensions.

  v128.load32_zero and v128.load64_zero instructions
  https://github.com/WebAssembly/simd/pull/237

The changes are straightforward:

* no new CLIF instructions.  They are translated into an existing CLIF scalar
  load followed by a CLIF `scalar_to_vector`.

* the comment/specification for CLIF `scalar_to_vector` has been changed to
  match the actual intended semantics, per consulation with Andrew Brown.

* translation from `scalar_to_vector` to aarch64 `fmov` instruction.  This
  has been generalised slightly so as to allow both 32- and 64-bit transfers.

* special-case zero in `lower_constant_f128` in order to avoid a
  potentially slow call to `Inst::load_fp_constant128`.

* Once "Allow loads to merge into other operations during instruction
  selection in MachInst backends"
  (https://github.com/bytecodealliance/wasmtime/issues/2340) lands,
  we can use that functionality to pattern match the two-CLIF pair and
  emit a single AArch64 instruction.

* A simple filetest has been added.

There is no comprehensive testcase in this commit, because that is a separate
repo.  The implementation has been tested, nevertheless.
This commit is contained in:
Julian Seward
2020-11-03 17:34:07 +01:00
committed by julian-seward1
parent 285edeec3e
commit dd9bfcefaa
9 changed files with 144 additions and 35 deletions

View File

@@ -179,8 +179,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
ctx.emit(Inst::MovToFpu { rd: va, rn: ra });
ctx.emit(Inst::MovToFpu { rd: vb, rn: rb });
ctx.emit(Inst::MovToFpu {
rd: va,
rn: ra,
size: ScalarSize::Size64,
});
ctx.emit(Inst::MovToFpu {
rd: vb,
rn: rb,
size: ScalarSize::Size64,
});
ctx.emit(Inst::FpuRRR {
fpu_op,
rd: va,
@@ -1703,7 +1711,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
(false, true) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
ctx.emit(Inst::MovToFpu { rd, rn });
ctx.emit(Inst::MovToFpu {
rd,
rn,
size: ScalarSize::Size64,
});
}
(true, false) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2056,6 +2068,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
Opcode::ScalarToVector => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]);
let input_ty = ctx.input_ty(insn, 0);
if (input_ty == I32 && ty.unwrap() == I32X4)
|| (input_ty == I64 && ty.unwrap() == I64X2)
{
ctx.emit(Inst::MovToFpu {
rd,
rn,
size: ScalarSize::from_ty(input_ty),
});
} else {
return Err(CodegenError::Unsupported(format!(
"ScalarToVector: unsupported types {:?} -> {:?}",
input_ty, ty
)));
}
}
Opcode::VanyTrue | Opcode::VallTrue => {
let rd = get_output_reg(ctx, outputs[0]);
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2341,7 +2373,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Vsplit
| Opcode::Vconcat
| Opcode::ScalarToVector
| Opcode::Uload8x8Complex
| Opcode::Sload8x8Complex
| Opcode::Uload16x4Complex