Add extend-add-pairwise instructions x64

This commit is contained in:
Johnnie Birch
2021-06-14 17:20:40 -07:00
parent 26c78c06ef
commit e373ddfe1b
10 changed files with 180 additions and 17 deletions

View File

@@ -190,9 +190,6 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
} }
match (testsuite, testname) { match (testsuite, testname) {
("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
("simd", _) => return false,
_ => {} _ => {}
} }
false false
@@ -220,11 +217,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("simd", _) if cfg!(feature = "old-x86-backend") => return true, ("simd", _) if cfg!(feature = "old-x86-backend") => return true,
// No simd support yet for s390x. // No simd support yet for s390x.
("simd", _) if platform_is_s390x() => return true, ("simd", _) if platform_is_s390x() => return true,
// These are new instructions that are not really implemented in any backend.
("simd", "simd_i16x8_extadd_pairwise_i8x16")
| ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
_ => {} _ => {}
}, },
_ => panic!("unrecognized strategy"), _ => panic!("unrecognized strategy"),

View File

@@ -4114,7 +4114,34 @@ pub(crate) fn define(
Inst::new( Inst::new(
"uwiden_high", "uwiden_high",
r#" r#"
Widen the high lanes of `x` using unsigned extension. Lane-wise integer extended pairwise addition producing extended results
(twice wider results than the input)
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"extended_pairwise_add_signed",
r#"
Widen the high lanes of `x` using signed extension.
This will double the lane width and halve the number of lanes.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);
ig.push(
Inst::new(
"extended_pairwise_add_unsigned",
r#"
Widen the high lanes of `x` extending with zeros.
This will double the lane width and halve the number of lanes. This will double the lane width and halve the number of lanes.
"#, "#,

View File

@@ -3519,7 +3519,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}); });
} }
Opcode::ConstAddr | Opcode::Vconcat | Opcode::Vsplit => unimplemented!("lowering {}", op), Opcode::ExtendedPairwiseAddSigned
| Opcode::ExtendedPairwiseAddUnsigned
| Opcode::ConstAddr
| Opcode::Vconcat
| Opcode::Vsplit => unimplemented!("lowering {}", op),
} }
Ok(()) Ok(())

View File

@@ -2868,7 +2868,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::WideningPairwiseDotProductS | Opcode::WideningPairwiseDotProductS
| Opcode::SqmulRoundSat | Opcode::SqmulRoundSat
| Opcode::FvpromoteLow | Opcode::FvpromoteLow
| Opcode::Fvdemote => { | Opcode::Fvdemote
| Opcode::ExtendedPairwiseAddSigned
| Opcode::ExtendedPairwiseAddUnsigned => {
// TODO // TODO
unimplemented!("Vector ops not implemented."); unimplemented!("Vector ops not implemented.");
} }

View File

@@ -568,6 +568,7 @@ pub enum SseOpcode {
Pinsrb, Pinsrb,
Pinsrw, Pinsrw,
Pinsrd, Pinsrd,
Pmaddubsw,
Pmaddwd, Pmaddwd,
Pmaxsb, Pmaxsb,
Pmaxsw, Pmaxsw,
@@ -746,6 +747,7 @@ impl SseOpcode {
| SseOpcode::Pcmpgtd | SseOpcode::Pcmpgtd
| SseOpcode::Pextrw | SseOpcode::Pextrw
| SseOpcode::Pinsrw | SseOpcode::Pinsrw
| SseOpcode::Pmaddubsw
| SseOpcode::Pmaddwd | SseOpcode::Pmaddwd
| SseOpcode::Pmaxsw | SseOpcode::Pmaxsw
| SseOpcode::Pmaxub | SseOpcode::Pmaxub
@@ -944,6 +946,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pinsrb => "pinsrb", SseOpcode::Pinsrb => "pinsrb",
SseOpcode::Pinsrw => "pinsrw", SseOpcode::Pinsrw => "pinsrw",
SseOpcode::Pinsrd => "pinsrd", SseOpcode::Pinsrd => "pinsrd",
SseOpcode::Pmaddubsw => "pmaddubsw",
SseOpcode::Pmaddwd => "pmaddwd", SseOpcode::Pmaddwd => "pmaddwd",
SseOpcode::Pmaxsb => "pmaxsb", SseOpcode::Pmaxsb => "pmaxsb",
SseOpcode::Pmaxsw => "pmaxsw", SseOpcode::Pmaxsw => "pmaxsw",

View File

@@ -1483,6 +1483,7 @@ pub(crate) fn emit(
SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2), SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2), SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2), SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3),
SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2), SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2), SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),

View File

@@ -4927,6 +4927,128 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} }
} }
} }
Opcode::ExtendedPairwiseAddSigned | Opcode::ExtendedPairwiseAddUnsigned => {
// Extended pairwise addition instructions computes extended sums within adjacent
// pairs of lanes of a SIMD vector, producing a SIMD vector with half as many lanes.
// Instruction sequences taken from instruction SPEC PR https://github.com/WebAssembly/simd/pull/380
/*
let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0);
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
unreachable!();
match op {
Opcode::ExtendedPairwiseAddSigned => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
static MUL_CONST: [u8; 16] = [0x01; 16];
let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16));
ctx.emit(Inst::xmm_mov(
SseOpcode::Movdqa,
RegMem::reg(mul_const_reg.to_reg()),
dst,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddubsw, RegMem::reg(src), dst));
}
(types::I16X8, types::I32X4) => {
static MUL_CONST: [u8; 16] = [
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0x01, 0x00, 0x01, 0x00,
];
let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
let mul_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I16X8));
ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmaddwd,
RegMem::reg(mul_const_reg.to_reg()),
dst,
));
}
_ => unreachable!(
"Type pattern not supported {:?}-{:?} not supported for {:?}.",
input_ty, output_ty, op
),
},
Opcode::ExtendedPairwiseAddUnsigned => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => {
static MUL_CONST: [u8; 16] = [0x01; 16];
let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST));
let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16));
ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmaddubsw,
RegMem::reg(mul_const_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I32X4) => {
static PXOR_CONST: [u8; 16] = [
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
0x00, 0x80, 0x00, 0x80,
];
let pxor_const =
ctx.use_constant(VCodeConstantData::WellKnown(&PXOR_CONST));
let pxor_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
pxor_const,
pxor_const_reg,
types::I16X8,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::reg(pxor_const_reg.to_reg()),
dst,
));
static MADD_CONST: [u8; 16] = [
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0x01, 0x00, 0x01, 0x00,
];
let madd_const =
ctx.use_constant(VCodeConstantData::WellKnown(&MADD_CONST));
let madd_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
madd_const,
madd_const_reg,
types::I16X8,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmaddwd,
RegMem::reg(madd_const_reg.to_reg()),
dst,
));
static ADDD_CONST2: [u8; 16] = [
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
0x00, 0x00, 0x01, 0x00,
];
let addd_const2 =
ctx.use_constant(VCodeConstantData::WellKnown(&ADDD_CONST2));
let addd_const2_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
addd_const2,
addd_const2_reg,
types::I16X8,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddd,
RegMem::reg(addd_const2_reg.to_reg()),
dst,
));
}
_ => unreachable!(
"Type pattern not supported {:?}-{:?} not supported for {:?}.",
input_ty, output_ty, op
),
},
_ => unreachable!("{:?} not supported.", op),
}
*/
}
Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => { Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
let input_ty = ctx.input_ty(insn, 0); let input_ty = ctx.input_ty(insn, 0);
let output_ty = ctx.output_ty(insn, 0); let output_ty = ctx.output_ty(insn, 0);

View File

@@ -630,6 +630,8 @@ where
Opcode::Fence => unimplemented!("Fence"), Opcode::Fence => unimplemented!("Fence"),
Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"), Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"),
Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"), Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"),
Opcode::ExtendedPairwiseAddSigned => unimplemented!("ExtendedPairwiseAddSigned"),
Opcode::ExtendedPairwiseAddUnsigned => unimplemented!("ExtendedPairwiseAddUnsigned"),
// TODO: these instructions should be removed once the new backend makes these obsolete // TODO: these instructions should be removed once the new backend makes these obsolete
// (see https://github.com/bytecodealliance/wasmtime/issues/1936); additionally, the // (see https://github.com/bytecodealliance/wasmtime/issues/1936); additionally, the

View File

@@ -1879,6 +1879,22 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let a = pop1_with_bitcast(state, I32X4, builder); let a = pop1_with_bitcast(state, I32X4, builder);
state.push1(builder.ins().uwiden_high(a)) state.push1(builder.ins().uwiden_high(a))
} }
Operator::I16x8ExtAddPairwiseI8x16S => {
let a = pop1_with_bitcast(state, I8X16, builder);
state.push1(builder.ins().extended_pairwise_add_signed(a))
}
Operator::I32x4ExtAddPairwiseI16x8S => {
let a = pop1_with_bitcast(state, I16X8, builder);
state.push1(builder.ins().extended_pairwise_add_signed(a))
}
Operator::I16x8ExtAddPairwiseI8x16U => {
let a = pop1_with_bitcast(state, I8X16, builder);
state.push1(builder.ins().extended_pairwise_add_unsigned(a))
}
Operator::I32x4ExtAddPairwiseI16x8U => {
let a = pop1_with_bitcast(state, I16X8, builder);
state.push1(builder.ins().extended_pairwise_add_unsigned(a))
}
Operator::F32x4Ceil | Operator::F64x2Ceil => { Operator::F32x4Ceil | Operator::F64x2Ceil => {
// This is something of a misuse of `type_of`, because that produces the return type // This is something of a misuse of `type_of`, because that produces the return type
// of `op`. In this case we want the arg type, but we know it's the same as the // of `op`. In this case we want the arg type, but we know it's the same as the
@@ -1982,12 +1998,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let b_high = builder.ins().uwiden_high(b); let b_high = builder.ins().uwiden_high(b);
state.push1(builder.ins().imul(a_high, b_high)); state.push1(builder.ins().imul(a_high, b_high));
} }
Operator::I16x8ExtAddPairwiseI8x16S
| Operator::I16x8ExtAddPairwiseI8x16U
| Operator::I32x4ExtAddPairwiseI16x8S
| Operator::I32x4ExtAddPairwiseI16x8U => {
return Err(wasm_unsupported!("proposed simd operator {:?}", op));
}
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => { Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
return Err(wasm_unsupported!("proposed tail-call operator {:?}", op)); return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
} }