CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions.
This patch implements, for aarch64, the following wasm SIMD extension:
the v128.load32_zero and v128.load64_zero instructions
(https://github.com/WebAssembly/simd/pull/237).

The changes are straightforward:

* no new CLIF instructions.  They are translated into an existing CLIF
  scalar load followed by a CLIF `scalar_to_vector` (see the sketch after
  the commit metadata below).

* the comment/specification for CLIF `scalar_to_vector` has been changed
  to match the actual intended semantics, per consultation with Andrew
  Brown.

* translation from `scalar_to_vector` to the aarch64 `fmov` instruction.
  This has been generalised slightly so as to allow both 32- and 64-bit
  transfers.

* special-case zero in `lower_constant_f128` in order to avoid a
  potentially slow call to `Inst::load_fp_constant128`.

* once "Allow loads to merge into other operations during instruction
  selection in MachInst backends"
  (https://github.com/bytecodealliance/wasmtime/issues/2340) lands, we can
  use that functionality to pattern-match the two-CLIF pair and emit a
  single AArch64 instruction.

* a simple filetest has been added.

There is no comprehensive testcase in this commit, because the test suite
for these instructions lives in a separate repo.  The implementation has
been tested, nevertheless.
Committed by: julian-seward1
Parent: 285edeec3e
Commit: dd9bfcefaa
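As a rough illustration of the translation strategy in the first bullet
above, the CLIF pair these instructions lower to can be built with
cranelift-frontend roughly as follows.  This is a minimal, hedged sketch,
not code from this commit: the helper function is hypothetical, and it
assumes the address has already been computed and bounds-checked.

```rust
use cranelift_codegen::ir::{types, InstBuilder, MemFlags, Value};
use cranelift_frontend::FunctionBuilder;

// Hypothetical helper (not from this commit): the CLIF pair that
// v128.load32_zero translates to. `addr` is assumed already bounds-checked.
fn emit_v128_load32_zero(builder: &mut FunctionBuilder, addr: Value) -> Value {
    // Existing CLIF scalar load of 32 bits -- no new CLIF instruction needed.
    let scalar = builder.ins().load(types::I32, MemFlags::new(), addr, 0);
    // `scalar_to_vector` places the scalar in lane 0 and zeroes all other
    // lanes -- exactly the v128.load32_zero semantics, and the semantics
    // this patch's comment change pins down.
    builder.ins().scalar_to_vector(types::I32X4, scalar)
}
```

The v128.load64_zero case is the same shape with `I64`/`I64X2`.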
@@ -179,8 +179,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let vb = ctx.alloc_tmp(RegClass::V128, I128);
             let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
             let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
-            ctx.emit(Inst::MovToFpu { rd: va, rn: ra });
-            ctx.emit(Inst::MovToFpu { rd: vb, rn: rb });
+            ctx.emit(Inst::MovToFpu {
+                rd: va,
+                rn: ra,
+                size: ScalarSize::Size64,
+            });
+            ctx.emit(Inst::MovToFpu {
+                rd: vb,
+                rn: rb,
+                size: ScalarSize::Size64,
+            });
             ctx.emit(Inst::FpuRRR {
                 fpu_op,
                 rd: va,
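The `size` field added above is what generalises `MovToFpu` from a fixed
64-bit transfer to either width.  A sketch of the resulting variant follows;
the field names are taken from the diff, but the surrounding enum shape and
doc comment are assumptions, not this commit's exact code:

```rust
// Hedged sketch of the generalised variant; `Writable`, `Reg` and
// `ScalarSize` are the aarch64 backend's existing types.
pub enum Inst {
    /// Move from an integer register to a vector/FP register: either
    /// `fmov s<rd>, w<rn>` (32 bits) or `fmov d<rd>, x<rn>` (64 bits),
    /// zeroing the rest of the destination register.
    MovToFpu {
        rd: Writable<Reg>,
        rn: Reg,
        size: ScalarSize,
    },
    // ...remaining instructions elided...
}
```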
@@ -1703,7 +1711,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
             (false, true) => {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
-                ctx.emit(Inst::MovToFpu { rd, rn });
+                ctx.emit(Inst::MovToFpu {
+                    rd,
+                    rn,
+                    size: ScalarSize::Size64,
+                });
             }
             (true, false) => {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2056,6 +2068,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
             }
 
+            Opcode::ScalarToVector => {
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rd = get_output_reg(ctx, outputs[0]);
+                let input_ty = ctx.input_ty(insn, 0);
+                if (input_ty == I32 && ty.unwrap() == I32X4)
+                    || (input_ty == I64 && ty.unwrap() == I64X2)
+                {
+                    ctx.emit(Inst::MovToFpu {
+                        rd,
+                        rn,
+                        size: ScalarSize::from_ty(input_ty),
+                    });
+                } else {
+                    return Err(CodegenError::Unsupported(format!(
+                        "ScalarToVector: unsupported types {:?} -> {:?}",
+                        input_ty, ty
+                    )));
+                }
+            }
+
             Opcode::VanyTrue | Opcode::VallTrue => {
                 let rd = get_output_reg(ctx, outputs[0]);
                 let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
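On the emission side, the two `ScalarSize` cases correspond to the two forms
of the A64 `FMOV (general)` instruction.  A hedged sketch of what the emit
arm might look like: the helper names (`machreg_to_gpr`, `machreg_to_vec`,
`sink.put4`) and the match shape are assumptions, while the two base
encodings are the architectural ones:

```rust
// Hedged sketch of MovToFpu emission; helper names are assumptions.
Inst::MovToFpu { rd, rn, size } => {
    let template = match size {
        ScalarSize::Size32 => 0x1e27_0000, // fmov Sd, Wn
        ScalarSize::Size64 => 0x9e67_0000, // fmov Dd, Xn
        _ => unreachable!("MovToFpu: unexpected size"),
    };
    // OR in Rn at bits [9:5] and Rd at bits [4:0].
    sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
}
```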
@@ -2341,7 +2373,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Vsplit
         | Opcode::Vconcat
-        | Opcode::ScalarToVector
         | Opcode::Uload8x8Complex
         | Opcode::Sload8x8Complex
         | Opcode::Uload16x4Complex
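One change from the commit message does not appear in this excerpt: the zero
special case in `lower_constant_f128`.  A minimal sketch of that fast path,
assuming the backend's existing `VecDupImm` instruction and `ASIMDMovModImm`
immediate type (those names are assumptions here, not shown in this diff):

```rust
// Hedged sketch of the lower_constant_f128 zero fast path described in
// the commit message; instruction/helper names are assumptions.
pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    rd: Writable<Reg>,
    value: u128,
) {
    if value == 0 {
        // An all-zeroes v128 can be materialised with a single
        // `movi vN.16b, #0`, avoiding a literal-pool load.
        ctx.emit(Inst::VecDupImm {
            rd,
            imm: ASIMDMovModImm::zero(),
            invert: false,
            size: VectorSize::Size8x16,
        });
    } else {
        // Slow path: load the 128-bit constant from memory.
        ctx.emit(Inst::load_fp_constant128(rd, value));
    }
}
```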