CL/aarch64: implement the wasm SIMD i32x4.dot_i16x8_s instruction
This patch implements, for aarch64, the i32x4.dot_i16x8_s instruction from the wasm SIMD extension (https://github.com/WebAssembly/simd/pull/127).

It also updates dependencies as follows, so that the new instruction can be parsed, decoded, etc:

* wat to 1.0.27
* wast to 26.0.1
* wasmparser to 0.65.0
* wasmprinter to 0.2.12

The changes are straightforward:

* new CLIF instruction `widening_pairwise_dot_product_s`
* translation from wasm into `widening_pairwise_dot_product_s`
* new AArch64 instructions `smull`, `smull2` (part of the `VecRRR` group)
* translation from `widening_pairwise_dot_product_s` to `smull ; smull2 ; addp`

There is no test case in this commit, because the relevant tests live in a separate repository. The implementation has nevertheless been tested.
committed by julian-seward1
parent 54a97f784e
commit 5a5fb11979
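For reference, the instruction being implemented computes a widening pairwise dot product over signed 16-bit lanes. The following scalar Rust sketch (illustrative only, not code from this patch) states the intended semantics:

```rust
/// Scalar model of wasm's i32x4.dot_i16x8_s: sign-extend each i16 lane to
/// i32, multiply corresponding lanes, then add adjacent pairs of products.
/// The addition is modular (wrapping), matching wasm integer semantics.
fn dot_i16x8_s(a: [i16; 8], b: [i16; 8]) -> [i32; 4] {
    let mut out = [0i32; 4];
    for i in 0..4 {
        let p0 = (a[2 * i] as i32) * (b[2 * i] as i32);
        let p1 = (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32);
        out[i] = p0.wrapping_add(p1);
    }
    out
}

fn main() {
    let a = [1, 2, 3, 4, 5, 6, 7, 8];
    let b = [1, 1, 1, 1, 2, 2, 2, 2];
    // [1*1 + 2*1, 3*1 + 4*1, 5*2 + 6*2, 7*2 + 8*2]
    assert_eq!(dot_i16x8_s(a, b), [3, 7, 22, 30]);
}
```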
@@ -677,6 +677,9 @@ impl VectorSize {
        }
    }

    /// Produces a `VectorSize` with lanes twice as wide. Note that if the resulting
    /// size would exceed 128 bits, then the number of lanes is also halved, so as to
    /// ensure that the result size is at most 128 bits.
    pub fn widen(&self) -> VectorSize {
        match self {
            VectorSize::Size8x8 => VectorSize::Size16x8,
@@ -689,6 +692,7 @@ impl VectorSize {
        }
    }

    /// Produces a `VectorSize` that has the same lane width, but half as many lanes.
    pub fn halve(&self) -> VectorSize {
        match self {
            VectorSize::Size8x16 => VectorSize::Size8x8,

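The disassembly tests further down depend on how `widen` and `halve` map the lane arrangements used by `smull`/`smull2` (e.g. `smull` with size 16x8 prints its sources as `.4h` and its destination as `.4s`). The standalone sketch below is a local stand-in for `VectorSize`, not the patch's code, and only restates the mappings implied by those tests:

```rust
// Minimal stand-in for Cranelift's VectorSize, covering the arrangements
// relevant to smull/smull2; remaining cases are elided in this sketch.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq)]
enum VectorSize {
    Size8x8,
    Size8x16,
    Size16x4,
    Size16x8,
    Size32x2,
    Size32x4,
    Size64x2,
}

impl VectorSize {
    // Lanes twice as wide; if the result would exceed 128 bits, halve the lane count.
    fn widen(self) -> VectorSize {
        match self {
            VectorSize::Size8x8 => VectorSize::Size16x8,
            VectorSize::Size8x16 => VectorSize::Size16x8,
            VectorSize::Size16x8 => VectorSize::Size32x4,
            VectorSize::Size32x4 => VectorSize::Size64x2,
            other => other, // remaining cases elided in this sketch
        }
    }

    // Same lane width, half as many lanes.
    fn halve(self) -> VectorSize {
        match self {
            VectorSize::Size8x16 => VectorSize::Size8x8,
            VectorSize::Size16x8 => VectorSize::Size16x4,
            VectorSize::Size32x4 => VectorSize::Size32x2,
            other => other,
        }
    }
}

fn main() {
    // smull with size 16x8: sources shown with halve() (.4h), destination with widen() (.4s).
    assert_eq!(VectorSize::Size16x8.halve(), VectorSize::Size16x4);
    assert_eq!(VectorSize::Size16x8.widen(), VectorSize::Size32x4);
}
```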
@@ -1950,11 +1950,13 @@ impl MachInstEmit for Inst {
                        (0b001_01110_00_1 | enc_size << 1, 0b100000)
                    }
                    VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
+                   VecALUOp::Smull => (0b000_01110_00_1 | enc_size << 1, 0b110000),
+                   VecALUOp::Smull2 => (0b010_01110_00_1 | enc_size << 1, 0b110000),
                };
-               let top11 = if is_float {
-                   top11 | (q << 9) | enc_float_size << 1
-               } else {
-                   top11 | (q << 9)
+               let top11 = match alu_op {
+                   VecALUOp::Smull | VecALUOp::Smull2 => top11,
+                   _ if is_float => top11 | (q << 9) | enc_float_size << 1,
+                   _ => top11 | (q << 9),
                };
                sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
            }

@@ -3243,6 +3243,78 @@ fn test_aarch64_binemit() {
        "zip1 v9.2d, v20.2d, v17.2d",
    ));

    insns.push((
        Inst::VecRRR {
            alu_op: VecALUOp::Smull,
            rd: writable_vreg(16),
            rn: vreg(12),
            rm: vreg(1),
            size: VectorSize::Size8x16,
        },
        "90C1210E",
        "smull v16.8h, v12.8b, v1.8b",
    ));

    insns.push((
        Inst::VecRRR {
            alu_op: VecALUOp::Smull,
            rd: writable_vreg(2),
            rn: vreg(13),
            rm: vreg(6),
            size: VectorSize::Size16x8,
        },
        "A2C1660E",
        "smull v2.4s, v13.4h, v6.4h",
    ));

    insns.push((
        Inst::VecRRR {
            alu_op: VecALUOp::Smull,
            rd: writable_vreg(8),
            rn: vreg(12),
            rm: vreg(14),
            size: VectorSize::Size32x4,
        },
        "88C1AE0E",
        "smull v8.2d, v12.2s, v14.2s",
    ));

    insns.push((
        Inst::VecRRR {
            alu_op: VecALUOp::Smull2,
            rd: writable_vreg(16),
            rn: vreg(12),
            rm: vreg(1),
            size: VectorSize::Size8x16,
        },
        "90C1214E",
        "smull2 v16.8h, v12.16b, v1.16b",
    ));

    insns.push((
        Inst::VecRRR {
            alu_op: VecALUOp::Smull2,
            rd: writable_vreg(2),
            rn: vreg(13),
            rm: vreg(6),
            size: VectorSize::Size16x8,
        },
        "A2C1664E",
        "smull2 v2.4s, v13.8h, v6.8h",
    ));

    insns.push((
        Inst::VecRRR {
            alu_op: VecALUOp::Smull2,
            rd: writable_vreg(8),
            rn: vreg(12),
            rm: vreg(14),
            size: VectorSize::Size32x4,
        },
        "88C1AE4E",
        "smull2 v8.2d, v12.4s, v14.4s",
    ));

    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Not,

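As a sanity check on these test vectors, each hex string is the little-endian byte form of a 32-bit word assembled from `top11`, `Rm`, bits 15..10, `Rn`, and `Rd`. The packing helper below is an illustrative stand-in, not Cranelift's `enc_vec_rrr`:

```rust
// Illustrative re-packing of the three-register vector format used above;
// the helper name and signature are assumptions, not Cranelift's enc_vec_rrr.
fn pack_vec_rrr(top11: u32, rm: u32, bits_15_10: u32, rn: u32, rd: u32) -> u32 {
    (top11 << 21) | (rm << 16) | (bits_15_10 << 10) | (rn << 5) | rd
}

fn main() {
    // smull v16.8h, v12.8b, v1.8b: top11 = 0b000_01110_00_1, bits 15..10 = 0b110000.
    let word = pack_vec_rrr(0b000_01110_00_1, 1, 0b110000, 12, 16);
    assert_eq!(word, 0x0E21_C190);
    // Emitted little-endian, which is how "90C1210E" in the test reads byte by byte.
    assert_eq!(word.to_le_bytes(), [0x90, 0xC1, 0x21, 0x0E]);
}
```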
@@ -291,6 +291,10 @@ pub enum VecALUOp {
    Umlal,
    /// Zip vectors (primary) [meaning, high halves]
    Zip1,
    /// Signed multiply long (low halves)
    Smull,
    /// Signed multiply long (high halves)
    Smull2,
}

/// A Vector miscellaneous operation with two registers.

@@ -3546,15 +3550,21 @@ impl Inst {
                    VecALUOp::Addp => ("addp", size),
                    VecALUOp::Umlal => ("umlal", size),
                    VecALUOp::Zip1 => ("zip1", size),
+                   VecALUOp::Smull => ("smull", size),
+                   VecALUOp::Smull2 => ("smull2", size),
                };
-               let rd_size = if alu_op == VecALUOp::Umlal {
-                   size.widen()
-               } else {
-                   size
+               let rd_size = match alu_op {
+                   VecALUOp::Umlal | VecALUOp::Smull | VecALUOp::Smull2 => size.widen(),
+                   _ => size
                };
+               let rn_size = match alu_op {
+                   VecALUOp::Smull => size.halve(),
+                   _ => size
+               };
+               let rm_size = rn_size;
                let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
-               let rn = show_vreg_vector(rn, mb_rru, size);
-               let rm = show_vreg_vector(rm, mb_rru, size);
+               let rn = show_vreg_vector(rn, mb_rru, rn_size);
+               let rm = show_vreg_vector(rm, mb_rru, rm_size);
                format!("{} {}, {}, {}", op, rd, rn, rm)
            }
            &Inst::VecMisc { op, rd, rn, size } => {

@@ -2375,6 +2375,47 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            });
        }

        Opcode::WideningPairwiseDotProductS => {
            let r_y = get_output_reg(ctx, outputs[0]);
            let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let ty = ty.unwrap();
            if ty == I32X4 {
                let tmp = ctx.alloc_tmp(RegClass::V128, I8X16);
                // The args have type I16X8.
                // "y = i32x4.dot_i16x8_s(a, b)"
                // => smull  tmp, a, b
                //    smull2 y, a, b
                //    addp   y, tmp, y
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Smull,
                    rd: tmp,
                    rn: r_a,
                    rm: r_b,
                    size: VectorSize::Size16x8,
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Smull2,
                    rd: r_y,
                    rn: r_a,
                    rm: r_b,
                    size: VectorSize::Size16x8,
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Addp,
                    rd: r_y,
                    rn: tmp.to_reg(),
                    rm: r_y.to_reg(),
                    size: VectorSize::Size32x4,
                });
            } else {
                return Err(CodegenError::Unsupported(format!(
                    "Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
                    ty
                )));
            }
        }

        Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => {
            let ty = ty.unwrap();
            let bits = ty_bits(ty);
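To see why the emitted sequence is correct: `smull` widening-multiplies the low four i16 lane pairs, `smull2` does the same for the high four, and `addp` then adds adjacent 32-bit lanes across the concatenation of the two results, which is exactly the pairwise dot product. A scalar model (illustrative Rust, not part of the patch):

```rust
/// Scalar model of the emitted sequence over 8 signed 16-bit lanes.
fn smull_smull2_addp(a: [i16; 8], b: [i16; 8]) -> [i32; 4] {
    // smull tmp, a, b: widening multiply of the low four lane pairs.
    let tmp = [0usize, 1, 2, 3].map(|i| a[i] as i32 * b[i] as i32);
    // smull2 y, a, b: widening multiply of the high four lane pairs.
    let y = [4usize, 5, 6, 7].map(|i| a[i] as i32 * b[i] as i32);
    // addp y, tmp, y: add adjacent i32 lanes, first across tmp, then across y.
    [
        tmp[0].wrapping_add(tmp[1]),
        tmp[2].wrapping_add(tmp[3]),
        y[0].wrapping_add(y[1]),
        y[2].wrapping_add(y[3]),
    ]
}

fn main() {
    let a = [1, 2, 3, 4, 5, 6, 7, 8];
    let b = [1, 1, 1, 1, 2, 2, 2, 2];
    // Matches the dot_i16x8_s reference model near the top of this page:
    // lane i of the result is a[2i]*b[2i] + a[2i+1]*b[2i+1].
    assert_eq!(smull_smull2_addp(a, b), [3, 7, 22, 30]);
}
```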