CL/aarch64: implement the wasm SIMD i32x4.dot_i16x8_s instruction

This patch implements, for aarch64, the following wasm SIMD extensions

  i32x4.dot_i16x8_s instruction
  https://github.com/WebAssembly/simd/pull/127

It also updates dependencies as follows, in order that the new instruction can
be parsed, decoded, etc:

  wat          to  1.0.27
  wast         to  26.0.1
  wasmparser   to  0.65.0
  wasmprinter  to  0.2.12

The changes are straightforward:

* new CLIF instruction `widening_pairwise_dot_product_s`

* translation from wasm into `widening_pairwise_dot_product_s`

* new AArch64 instructions `smull`, `smull2` (part of the `VecRRR` group)

* translation from `widening_pairwise_dot_product_s` to `smull ; smull2 ; addp`

There is no test case in this commit, because the relevant test suite lives in
a separate repository.  The implementation has nevertheless been tested.
This commit is contained in:
Julian Seward
2020-10-27 15:04:32 +01:00
committed by julian-seward1
parent 54a97f784e
commit 5a5fb11979
26 changed files with 228 additions and 54 deletions

View File

@@ -677,6 +677,9 @@ impl VectorSize {
}
}
/// Produces a `VectorSize` with lanes twice as wide. Note that if the resulting
/// size would exceed 128 bits, then the number of lanes is also halved, so as to
/// ensure that the result size is at most 128 bits.
pub fn widen(&self) -> VectorSize {
match self {
VectorSize::Size8x8 => VectorSize::Size16x8,
@@ -689,6 +692,7 @@ impl VectorSize {
}
}
/// Produces a `VectorSize` that has the same lane width, but half as many lanes.
pub fn halve(&self) -> VectorSize {
match self {
VectorSize::Size8x16 => VectorSize::Size8x8,

View File

@@ -1950,11 +1950,13 @@ impl MachInstEmit for Inst {
(0b001_01110_00_1 | enc_size << 1, 0b100000)
}
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
};
let top11 = if is_float {
top11 | (q << 9) | enc_float_size << 1
} else {
top11 | (q << 9)
let top11 = match alu_op {
VecALUOp::Smull | VecALUOp::Smull2 => top11,
_ if is_float => top11 | (q << 9) | enc_float_size << 1,
_ => top11 | (q << 9),
};
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}

View File

@@ -3243,6 +3243,78 @@ fn test_aarch64_binemit() {
"zip1 v9.2d, v20.2d, v17.2d",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Smull,
rd: writable_vreg(16),
rn: vreg(12),
rm: vreg(1),
size: VectorSize::Size8x16,
},
"90C1210E",
"smull v16.8h, v12.8b, v1.8b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Smull,
rd: writable_vreg(2),
rn: vreg(13),
rm: vreg(6),
size: VectorSize::Size16x8,
},
"A2C1660E",
"smull v2.4s, v13.4h, v6.4h",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Smull,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(14),
size: VectorSize::Size32x4,
},
"88C1AE0E",
"smull v8.2d, v12.2s, v14.2s",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Smull2,
rd: writable_vreg(16),
rn: vreg(12),
rm: vreg(1),
size: VectorSize::Size8x16,
},
"90C1214E",
"smull2 v16.8h, v12.16b, v1.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Smull2,
rd: writable_vreg(2),
rn: vreg(13),
rm: vreg(6),
size: VectorSize::Size16x8,
},
"A2C1664E",
"smull2 v2.4s, v13.8h, v6.8h",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Smull2,
rd: writable_vreg(8),
rn: vreg(12),
rm: vreg(14),
size: VectorSize::Size32x4,
},
"88C1AE4E",
"smull2 v8.2d, v12.4s, v14.4s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Not,

View File

@@ -291,6 +291,10 @@ pub enum VecALUOp {
Umlal,
/// Zip vectors (primary) [meaning, high halves]
Zip1,
/// Signed multiply long (low halves)
Smull,
/// Signed multiply long (high halves)
Smull2,
}
/// A Vector miscellaneous operation with two registers.
@@ -3546,15 +3550,21 @@ impl Inst {
VecALUOp::Addp => ("addp", size),
VecALUOp::Umlal => ("umlal", size),
VecALUOp::Zip1 => ("zip1", size),
VecALUOp::Smull => ("smull", size),
VecALUOp::Smull2 => ("smull2", size),
};
let rd_size = if alu_op == VecALUOp::Umlal {
size.widen()
} else {
size
let rd_size = match alu_op {
VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(),
_ => size
};
let rn_size = match alu_op {
VecALUOp::Smull => size.halve(),
_ => size
};
let rm_size = rn_size;
let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
let rn = show_vreg_vector(rn, mb_rru, size);
let rm = show_vreg_vector(rm, mb_rru, size);
let rn = show_vreg_vector(rn, mb_rru, rn_size);
let rm = show_vreg_vector(rm, mb_rru, rm_size);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
&Inst::VecMisc { op, rd, rn, size } => {

View File

@@ -2375,6 +2375,47 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
Opcode::WideningPairwiseDotProductS => {
let r_y = get_output_reg(ctx, outputs[0]);
let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let ty = ty.unwrap();
if ty == I32X4 {
let tmp = ctx.alloc_tmp(RegClass::V128, I8X16);
// The args have type I16X8.
// "y = i32x4.dot_i16x8_s(a, b)"
// => smull tmp, a, b
// smull2 y, a, b
// addp y, tmp, y
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Smull,
rd: tmp,
rn: r_a,
rm: r_b,
size: VectorSize::Size16x8,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Smull2,
rd: r_y,
rn: r_a,
rm: r_b,
size: VectorSize::Size16x8,
});
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Addp,
rd: r_y,
rn: tmp.to_reg(),
rm: r_y.to_reg(),
size: VectorSize::Size32x4,
});
} else {
return Err(CodegenError::Unsupported(format!(
"Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
ty
)));
}
}
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
let ty = ty.unwrap();
let bits = ty_bits(ty);