Merge pull request #3031 from jlb6740/extend-add-pairwise-x64

Add extend-add-pairwise instructions x64
Chris Fallin
2021-07-31 21:14:49 -07:00
committed by GitHub
10 changed files with 223 additions and 33 deletions


@@ -156,10 +156,8 @@ fn write_testsuite_tests(
let testname = extract_name(path);
writeln!(out, "#[test]")?;
if x64_should_panic(testsuite, &testname, strategy) {
writeln!(out, r#"#[should_panic]"#)?;
// Ignore when using QEMU for running tests (limited memory).
} else if ignore(testsuite, &testname, strategy) || (pooling && platform_is_emulated()) {
if ignore(testsuite, &testname, strategy) || (pooling && platform_is_emulated()) {
writeln!(out, "#[ignore]")?;
}
@@ -182,22 +180,6 @@ fn write_testsuite_tests(
Ok(())
}
/// For x64 backend features that are not supported yet, mark tests as panicking, so
/// they stop "passing" once the features are properly implemented.
fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
if !platform_is_x64() || strategy != "Cranelift" {
return false;
}
match (testsuite, testname) {
("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
("simd", _) => return false,
_ => {}
}
false
}
/// Ignore tests that aren't supported yet.
fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
match strategy {
@@ -220,11 +202,13 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("simd", _) if cfg!(feature = "old-x86-backend") => return true,
// No simd support yet for s390x.
("simd", _) if platform_is_s390x() => return true,
// These are new instructions that are not really implemented in any backend.
// These are new instructions that are only known to be supported for x64.
("simd", "simd_i16x8_extadd_pairwise_i8x16")
| ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
| ("simd", "simd_i32x4_extadd_pairwise_i16x8")
if !platform_is_x64() =>
{
return true
}
_ => {}
},
_ => panic!("unrecognized strategy"),
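With this change the two extadd-pairwise tests are no longer forced to `#[should_panic]` on x64; they are only `#[ignore]`d on platforms other than x64. For context, the stub this generator writes for such a test looks roughly like the sketch below (the attribute handling matches the code above, but the function name formatting and body are illustrative, not the actual generated output):

#[test]
#[ignore] // emitted only when `ignore(...)` returns true, e.g. on non-x64 platforms
fn simd_i16x8_extadd_pairwise_i8x16() {
    // body elided: runs the corresponding .wast file through the test harness
}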


@@ -4114,9 +4114,9 @@ pub(crate) fn define(
Inst::new(
"uwiden_high",
r#"
Widen the high lanes of `x` using unsigned extension.
This will double the lane width and halve the number of lanes.
"#,
&formats.unary,
)
@@ -4124,6 +4124,27 @@ pub(crate) fn define(
.operands_out(vec![a]),
);
let x = &Operand::new("x", I8or16or32xN);
let y = &Operand::new("y", I8or16or32xN);
let a = &Operand::new("a", I8or16or32xN);
ig.push(
Inst::new(
"iadd_pairwise",
r#"
Does lane-wise integer pairwise addition on two operands, putting the
combined results into a single vector result. Here a pair refers to adjacent
lanes in a vector, i.e. lanes i*2 and i*2+1 for i in 0..num_lanes/2. The
pairwise sums of the first operand make up the low half of the resulting
vector, while the pairwise sums of the second operand make up the upper half.
"#,
&formats.binary,
)
.operands_in(vec![x, y])
.operands_out(vec![a]),
);
let I16x8 = &TypeVar::new(
"I16x8",
"A SIMD vector type containing 8 integer lanes each 16 bits wide.",


@@ -3519,7 +3519,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
}
Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => unimplemented!("lowering {}", op),
Opcode::IaddPairwise | Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => {
unimplemented!("lowering {}", op)
}
}
Ok(())


@@ -2868,7 +2868,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::WideningPairwiseDotProductS
| Opcode::SqmulRoundSat
| Opcode::FvpromoteLow
| Opcode::Fvdemote => {
| Opcode::Fvdemote
| Opcode::IaddPairwise => {
// TODO
unimplemented!("Vector ops not implemented.");
}


@@ -568,6 +568,7 @@ pub enum SseOpcode {
Pinsrb,
Pinsrw,
Pinsrd,
Pmaddubsw,
Pmaddwd,
Pmaxsb,
Pmaxsw,
@@ -746,6 +747,7 @@ impl SseOpcode {
| SseOpcode::Pcmpgtd
| SseOpcode::Pextrw
| SseOpcode::Pinsrw
| SseOpcode::Pmaddubsw
| SseOpcode::Pmaddwd
| SseOpcode::Pmaxsw
| SseOpcode::Pmaxub
@@ -944,6 +946,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pinsrb => "pinsrb",
SseOpcode::Pinsrw => "pinsrw",
SseOpcode::Pinsrd => "pinsrd",
SseOpcode::Pmaddubsw => "pmaddubsw",
SseOpcode::Pmaddwd => "pmaddwd",
SseOpcode::Pmaxsb => "pmaxsb",
SseOpcode::Pmaxsw => "pmaxsw",
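The newly added `Pmaddubsw` opcode, together with the existing `Pmaddwd`, is what the x64 lowering further down is built on. Per the Intel SDM, `pmaddubsw` multiplies unsigned bytes from the destination operand by signed bytes from the source operand and sums adjacent products with signed saturation into i16 lanes, while `pmaddwd` multiplies signed i16 lanes and sums adjacent products into i32 lanes. A scalar sketch of one result lane of each (illustrative helpers, not part of the change):

// One i16 lane of pmaddubsw: unsigned * signed byte products, adjacent pair
// summed with signed saturation.
fn pmaddubsw_lane(dst: [u8; 2], src: [i8; 2]) -> i16 {
    let sum = dst[0] as i32 * src[0] as i32 + dst[1] as i32 * src[1] as i32;
    sum.clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

// One i32 lane of pmaddwd: signed 16-bit products, adjacent pair summed into
// i32 (the sum always fits, so no saturation is needed).
fn pmaddwd_lane(dst: [i16; 2], src: [i16; 2]) -> i32 {
    dst[0] as i32 * src[0] as i32 + dst[1] as i32 * src[1] as i32
}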


@@ -1483,6 +1483,7 @@ pub(crate) fn emit(
SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3),
SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
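For reference, `pmaddubsw` is an SSSE3 instruction with a three-byte opcode, which is what the `(LegacyPrefixes::_66, 0x0F3804, 3)` entry above describes (the trailing `3` being the opcode length in bytes). A register-register form would be expected to encode as follows (byte breakdown added for illustration):

// pmaddubsw %xmm1, %xmm0 (AT&T syntax) should emit:
//   66         - legacy prefix selecting the 128-bit XMM form
//   0F 38 04   - the three-byte SSSE3 opcode
//   C1         - ModRM byte: mod=11, reg=xmm0 (dst), rm=xmm1 (src)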


@@ -4927,6 +4927,165 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}
}
Opcode::IaddPairwise => {
if let (Some(swiden_low), Some(swiden_high)) = (
matches_input(ctx, inputs[0], Opcode::SwidenLow),
matches_input(ctx, inputs[1], Opcode::SwidenHigh),
) {
let swiden_input = &[
InsnInput {
insn: swiden_low,
input: 0,
},
InsnInput {
insn: swiden_high,
input: 0,
},
];
let input_ty = ctx.input_ty(swiden_low, 0);
let output_ty = ctx.output_ty(insn, 0);
let src0 = put_input_in_reg(ctx, swiden_input[0]);
let src1 = put_input_in_reg(ctx, swiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if src0 != src1 {
unimplemented!(
"iadd_pairwise not implemented for general case with different inputs"
);
}
match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
static MUL_CONST: [u8; 16] = [0x01; 16];
let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16));
ctx.emit(Inst::xmm_mov(
SseOpcode::Movdqa,
RegMem::reg(mul_const_reg.to_reg()),
dst,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddubsw, RegMem::reg(src0), dst));
}
(types::I16X8, types::I32X4) => {
static MUL_CONST: [u8; 16] = [
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0x01, 0x00, 0x01, 0x00,
];
let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
let mul_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I16X8));
ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src0), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmaddwd,
RegMem::reg(mul_const_reg.to_reg()),
dst,
));
}
_ => {
unimplemented!("Type not supported for {:?}", op);
}
}
} else if let (Some(uwiden_low), Some(uwiden_high)) = (
matches_input(ctx, inputs[0], Opcode::UwidenLow),
matches_input(ctx, inputs[1], Opcode::UwidenHigh),
) {
let uwiden_input = &[
InsnInput {
insn: uwiden_low,
input: 0,
},
InsnInput {
insn: uwiden_high,
input: 0,
},
];
let input_ty = ctx.input_ty(uwiden_low, 0);
let output_ty = ctx.output_ty(insn, 0);
let src0 = put_input_in_reg(ctx, uwiden_input[0]);
let src1 = put_input_in_reg(ctx, uwiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if src0 != src1 {
unimplemented!(
"iadd_pairwise not implemented for general case with different inputs"
);
}
match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
static MUL_CONST: [u8; 16] = [0x01; 16];
let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16));
ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src0), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmaddubsw,
RegMem::reg(mul_const_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I32X4) => {
static PXOR_CONST: [u8; 16] = [
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
0x00, 0x80, 0x00, 0x80,
];
let pxor_const =
ctx.use_constant(VCodeConstantData::WellKnown(&PXOR_CONST));
let pxor_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
pxor_const,
pxor_const_reg,
types::I16X8,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src0), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::reg(pxor_const_reg.to_reg()),
dst,
));
static MADD_CONST: [u8; 16] = [
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0x01, 0x00, 0x01, 0x00,
];
let madd_const =
ctx.use_constant(VCodeConstantData::WellKnown(&MADD_CONST));
let madd_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
madd_const,
madd_const_reg,
types::I16X8,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmaddwd,
RegMem::reg(madd_const_reg.to_reg()),
dst,
));
static ADDD_CONST2: [u8; 16] = [
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
0x00, 0x00, 0x01, 0x00,
];
let addd_const2 =
ctx.use_constant(VCodeConstantData::WellKnown(&ADDD_CONST2));
let addd_const2_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
addd_const2,
addd_const2_reg,
types::I16X8,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddd,
RegMem::reg(addd_const2_reg.to_reg()),
dst,
));
}
_ => {
unimplemented!("Type not supported for {:?}", op);
}
}
} else {
unimplemented!("Operands not supported for {:?}", op);
}
}
Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
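A note on why the lowering above works: with one operand set to all ones, `pmaddubsw` and `pmaddwd` degenerate into pairwise extending adds, and since the widen-low/widen-high inputs are required to come from the same source register (the `src0 != src1` guard), a single multiply-add against a constant covers the whole vector. The unsigned i16x8-to-i32x4 case has no unsigned multiply-add to lean on, so it biases each u16 lane into signed range with `pxor 0x8000`, runs the signed `pmaddwd` against ones, and then adds the pair's combined bias (2 * 0x8000 = 0x10000) back per i32 lane with `paddd`. Scalar sketches of one output lane for each case (illustrative helpers, not part of the change):

// i8x16 -> i16x8, signed: dst holds the all-ones constant (unsigned 1s), src
// holds the signed input bytes, so pmaddubsw computes 1*x0 + 1*x1. The sum of
// two bytes always fits in i16, so its saturation never triggers.
fn extadd_pairwise_s8(x0: i8, x1: i8) -> i16 {
    x0 as i16 + x1 as i16
}

// i8x16 -> i16x8, unsigned: dst holds the (unsigned) input bytes and src the
// all-ones (signed) constant, giving x0*1 + x1*1.
fn extadd_pairwise_u8(x0: u8, x1: u8) -> i16 {
    x0 as i16 + x1 as i16
}

// i16x8 -> i32x4, unsigned: bias, signed multiply-add, then un-bias.
fn extadd_pairwise_u16(x0: u16, x1: u16) -> u32 {
    // pxor with 0x8000 flips the sign bit; reinterpreted as i16, each lane is
    // now x - 0x8000.
    let b0 = (x0 ^ 0x8000) as i16 as i32;
    let b1 = (x1 ^ 0x8000) as i16 as i32;
    // pmaddwd against all-ones just adds the biased pair: x0 + x1 - 0x10000.
    let sum = b0 + b1;
    // paddd with 0x0001_0000 in every i32 lane restores the bias.
    (sum + 0x0001_0000) as u32
}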


@@ -630,6 +630,7 @@ where
Opcode::Fence => unimplemented!("Fence"),
Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"),
Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"),
Opcode::IaddPairwise => unimplemented!("IaddPairwise"),
// TODO: these instructions should be removed once the new backend makes these obsolete
// (see https://github.com/bytecodealliance/wasmtime/issues/1936); additionally, the


@@ -1879,6 +1879,30 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let a = pop1_with_bitcast(state, I32X4, builder);
state.push1(builder.ins().uwiden_high(a))
}
Operator::I16x8ExtAddPairwiseI8x16S => {
let a = pop1_with_bitcast(state, I8X16, builder);
let widen_low = builder.ins().swiden_low(a);
let widen_high = builder.ins().swiden_high(a);
state.push1(builder.ins().iadd_pairwise(widen_low, widen_high));
}
Operator::I32x4ExtAddPairwiseI16x8S => {
let a = pop1_with_bitcast(state, I16X8, builder);
let widen_low = builder.ins().swiden_low(a);
let widen_high = builder.ins().swiden_high(a);
state.push1(builder.ins().iadd_pairwise(widen_low, widen_high));
}
Operator::I16x8ExtAddPairwiseI8x16U => {
let a = pop1_with_bitcast(state, I8X16, builder);
let widen_low = builder.ins().uwiden_low(a);
let widen_high = builder.ins().uwiden_high(a);
state.push1(builder.ins().iadd_pairwise(widen_low, widen_high));
}
Operator::I32x4ExtAddPairwiseI16x8U => {
let a = pop1_with_bitcast(state, I16X8, builder);
let widen_low = builder.ins().uwiden_low(a);
let widen_high = builder.ins().uwiden_high(a);
state.push1(builder.ins().iadd_pairwise(widen_low, widen_high));
}
Operator::F32x4Ceil | Operator::F64x2Ceil => {
// This is something of a misuse of `type_of`, because that produces the return type
// of `op`. In this case we want the arg type, but we know it's the same as the
@@ -1982,12 +2006,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let b_high = builder.ins().uwiden_high(b);
state.push1(builder.ins().imul(a_high, b_high));
}
Operator::I16x8ExtAddPairwiseI8x16S
| Operator::I16x8ExtAddPairwiseI8x16U
| Operator::I32x4ExtAddPairwiseI16x8S
| Operator::I32x4ExtAddPairwiseI16x8U => {
return Err(wasm_unsupported!("proposed simd operator {:?}", op));
}
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
}
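As a quick scalar check (illustrative only) that the decomposition used above, `iadd_pairwise(swiden_low(a), swiden_high(a))`, matches the wasm semantics of `i16x8.extadd_pairwise_i8x16_s` (result lane i is `a[2i] + a[2i+1]`, sign-extended):

fn extadd_pairwise_i8x16_s(a: [i8; 16]) -> [i16; 8] {
    // swiden_low / swiden_high of the same source
    let low: [i16; 8] = core::array::from_fn(|i| a[i] as i16);
    let high: [i16; 8] = core::array::from_fn(|i| a[8 + i] as i16);
    // iadd_pairwise: pairwise sums of the first operand form the low half and
    // pairwise sums of the second form the upper half, so lane i of the
    // result is a[2*i] + a[2*i + 1] for all eight lanes.
    core::array::from_fn(|i| {
        if i < 4 {
            low[2 * i] + low[2 * i + 1]
        } else {
            high[2 * (i - 4)] + high[2 * (i - 4) + 1]
        }
    })
}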