Add simd_extmul_* support for x64

This commit is contained in:
Johnnie Birch
2021-07-10 19:57:19 -07:00
parent 6a5a295019
commit 6fbe0b72bd
5 changed files with 436 additions and 21 deletions

View File

@@ -190,12 +190,9 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
match (testsuite, testname) {
("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
("simd", "simd_i16x8_extmul_i8x16") => return true,
("simd", "simd_i16x8_q15mulr_sat_s") => return true,
("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
("simd", "simd_i32x4_extmul_i16x8") => return true,
("simd", "simd_i32x4_trunc_sat_f64x2") => return true,
("simd", "simd_i64x2_extmul_i32x4") => return true,
("simd", "simd_int_to_int_extend") => return true,
("simd", _) => return false,
_ => {}
@@ -229,10 +226,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
// These are new instructions that are not really implemented in any backend.
("simd", "simd_conversions")
| ("simd", "simd_i16x8_extadd_pairwise_i8x16")
| ("simd", "simd_i16x8_extmul_i8x16")
| ("simd", "simd_i32x4_extadd_pairwise_i16x8")
| ("simd", "simd_i32x4_extmul_i16x8")
| ("simd", "simd_i64x2_extmul_i32x4") => return true,
| ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true,
_ => {}
},

View File

@@ -593,6 +593,9 @@ pub enum SseOpcode {
Pmovzxwd,
Pmovzxwq,
Pmovzxdq,
Pmuldq,
Pmulhw,
Pmulhuw,
Pmulld,
Pmullw,
Pmuludq,
@@ -617,7 +620,9 @@ pub enum SseOpcode {
Psubusw,
Ptest,
Punpckhbw,
Punpckhwd,
Punpcklbw,
Punpcklwd,
Pxor,
Rcpss,
Roundps,
@@ -742,6 +747,8 @@ impl SseOpcode {
| SseOpcode::Pminsw
| SseOpcode::Pminub
| SseOpcode::Pmovmskb
| SseOpcode::Pmulhw
| SseOpcode::Pmulhuw
| SseOpcode::Pmullw
| SseOpcode::Pmuludq
| SseOpcode::Por
@@ -763,7 +770,9 @@ impl SseOpcode {
| SseOpcode::Psubusb
| SseOpcode::Psubusw
| SseOpcode::Punpckhbw
| SseOpcode::Punpckhwd
| SseOpcode::Punpcklbw
| SseOpcode::Punpcklwd
| SseOpcode::Pxor
| SseOpcode::Sqrtpd
| SseOpcode::Sqrtsd
@@ -808,6 +817,7 @@ impl SseOpcode {
| SseOpcode::Pmovzxwd
| SseOpcode::Pmovzxwq
| SseOpcode::Pmovzxdq
| SseOpcode::Pmuldq
| SseOpcode::Pmulld
| SseOpcode::Ptest
| SseOpcode::Roundps
@@ -953,6 +963,9 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pmovzxwd => "pmovzxwd",
SseOpcode::Pmovzxwq => "pmovzxwq",
SseOpcode::Pmovzxdq => "pmovzxdq",
SseOpcode::Pmuldq => "pmuldq",
SseOpcode::Pmulhw => "pmulhw",
SseOpcode::Pmulhuw => "pmulhuw",
SseOpcode::Pmulld => "pmulld",
SseOpcode::Pmullw => "pmullw",
SseOpcode::Pmuludq => "pmuludq",
@@ -977,7 +990,9 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Psubusw => "psubusw",
SseOpcode::Ptest => "ptest",
SseOpcode::Punpckhbw => "punpckhbw",
SseOpcode::Punpckhwd => "punpckhwd",
SseOpcode::Punpcklbw => "punpcklbw",
SseOpcode::Punpcklwd => "punpcklwd",
SseOpcode::Pxor => "pxor",
SseOpcode::Rcpss => "rcpss",
SseOpcode::Roundps => "roundps",

View File

@@ -1509,6 +1509,9 @@ pub(crate) fn emit(
SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2),
SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3),
SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3),
SseOpcode::Pmuldq => (LegacyPrefixes::_66, 0x0F3828, 3),
SseOpcode::Pmulhw => (LegacyPrefixes::_66, 0x0FE5, 2),
SseOpcode::Pmulhuw => (LegacyPrefixes::_66, 0x0FE4, 2),
SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
@@ -1523,7 +1526,9 @@ pub(crate) fn emit(
SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2),
SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2),
SseOpcode::Punpckhbw => (LegacyPrefixes::_66, 0x0F68, 2),
SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2),
SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2),
SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2),
SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),

View File

@@ -1662,7 +1662,348 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Imul => {
let ty = ty.unwrap();
if ty == types::I64X2 {
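// First check whether this multiply is really an extmul_* operation, i.e. an `imul`
// whose two operands are both produced by the same kind of widening instruction.
// Where possible, the extmul_* lowerings follow the optimized sequences proposed
// here: https://github.com/WebAssembly/simd/pull/376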
if let Some(swiden0_high) = matches_input(ctx, inputs[0], Opcode::SwidenHigh) {
if let Some(swiden1_high) = matches_input(ctx, inputs[1], Opcode::SwidenHigh) {
let swiden_input = &[
InsnInput {
insn: swiden0_high,
input: 0,
},
InsnInput {
insn: swiden1_high,
input: 0,
},
];
let input0_ty = ctx.input_ty(swiden0_high, 0);
let input1_ty = ctx.input_ty(swiden1_high, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, swiden_input[0]);
let rhs = put_input_in_reg(ctx, swiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_high_i8x16_s
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(lhs),
tmp_reg,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovsxbw,
RegMem::reg(lhs),
tmp_reg,
));
ctx.emit(Inst::gen_move(dst, rhs, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(rhs),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmullw,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_high_i16x8_s
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmulhw, RegMem::reg(rhs), tmp_reg));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpckhwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_high_i32x4_s
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuldq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
_ => panic!("Unsupported extmul_low_signed type"),
}
}
} else if let Some(swiden0_low) = matches_input(ctx, inputs[0], Opcode::SwidenLow) {
if let Some(swiden1_low) = matches_input(ctx, inputs[1], Opcode::SwidenLow) {
let swiden_input = &[
InsnInput {
insn: swiden0_low,
input: 0,
},
InsnInput {
insn: swiden1_low,
input: 0,
},
];
let input0_ty = ctx.input_ty(swiden0_low, 0);
let input1_ty = ctx.input_ty(swiden1_low, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, swiden_input[0]);
let rhs = put_input_in_reg(ctx, swiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_low_i8x16_s
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovsxbw,
RegMem::reg(lhs),
tmp_reg,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmullw,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_low_i16x8_s
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmulhw, RegMem::reg(rhs), tmp_reg));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpcklwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_low_i32x4_s
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuldq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
_ => panic!("Unsupported extmul_low_signed type"),
}
}
} else if let Some(uwiden0_high) = matches_input(ctx, inputs[0], Opcode::UwidenHigh) {
if let Some(uwiden1_high) = matches_input(ctx, inputs[1], Opcode::UwidenHigh) {
let uwiden_input = &[
InsnInput {
insn: uwiden0_high,
input: 0,
},
InsnInput {
insn: uwiden1_high,
input: 0,
},
];
let input0_ty = ctx.input_ty(uwiden0_high, 0);
let input1_ty = ctx.input_ty(uwiden1_high, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, uwiden_input[0]);
let rhs = put_input_in_reg(ctx, uwiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_high_i8x16_u
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(lhs),
tmp_reg,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovzxbw,
RegMem::reg(lhs),
tmp_reg,
));
ctx.emit(Inst::gen_move(dst, rhs, output_ty));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Palignr,
RegMem::reg(rhs),
dst,
8,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmullw,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_high_i16x8_u
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmulhuw,
RegMem::reg(rhs),
tmp_reg,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpckhwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_high_i32x4_u
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0xFA,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuludq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
_ => panic!("Unsupported extmul_low_signed type"),
}
}
} else if let Some(uwiden0_low) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
if let Some(uwiden1_low) = matches_input(ctx, inputs[1], Opcode::UwidenLow) {
let uwiden_input = &[
InsnInput {
insn: uwiden0_low,
input: 0,
},
InsnInput {
insn: uwiden1_low,
input: 0,
},
];
let input0_ty = ctx.input_ty(uwiden0_low, 0);
let input1_ty = ctx.input_ty(uwiden1_low, 0);
let output_ty = ctx.output_ty(insn, 0);
let lhs = put_input_in_reg(ctx, uwiden_input[0]);
let rhs = put_input_in_reg(ctx, uwiden_input[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
match (input0_ty, input1_ty, output_ty) {
(types::I8X16, types::I8X16, types::I16X8) => {
// i16x8.extmul_low_i8x16_u
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(
SseOpcode::Pmovzxbw,
RegMem::reg(lhs),
tmp_reg,
));
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmullw,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
(types::I16X8, types::I16X8, types::I32X4) => {
// i32x4.extmul_low_i16x8_u
ctx.emit(Inst::gen_move(dst, lhs, input0_ty));
let tmp_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
ctx.emit(Inst::gen_move(tmp_reg, lhs, input0_ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(rhs), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmulhuw,
RegMem::reg(rhs),
tmp_reg,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Punpcklwd,
RegMem::from(tmp_reg),
dst,
));
}
(types::I32X4, types::I32X4, types::I64X2) => {
// i64x2.extmul_low_i32x4_u
let tmp_reg = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(lhs),
tmp_reg,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
RegMem::reg(rhs),
dst,
0x50,
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuludq,
RegMem::reg(tmp_reg.to_reg()),
dst,
));
}
_ => panic!("Unsupported extmul_low_signed type"),
}
}
} else if ty == types::I64X2 {
// Eventually one of these should be `input_to_reg_mem` (TODO).
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);

View File

@@ -1911,19 +1911,79 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
state.push1(builder.ins().sqmul_round_sat(a, b))
}
Operator::I16x8ExtMulLowI8x16S
| Operator::I16x8ExtMulHighI8x16S
| Operator::I16x8ExtMulLowI8x16U
| Operator::I16x8ExtMulHighI8x16U
| Operator::I32x4ExtMulLowI16x8S
| Operator::I32x4ExtMulHighI16x8S
| Operator::I32x4ExtMulLowI16x8U
| Operator::I32x4ExtMulHighI16x8U
| Operator::I64x2ExtMulLowI32x4S
| Operator::I64x2ExtMulHighI32x4S
| Operator::I64x2ExtMulLowI32x4U
| Operator::I64x2ExtMulHighI32x4U
| Operator::I16x8ExtAddPairwiseI8x16S
Operator::I16x8ExtMulLowI8x16S => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
let a_low = builder.ins().swiden_low(a);
let b_low = builder.ins().swiden_low(b);
state.push1(builder.ins().imul(a_low, b_low));
}
Operator::I16x8ExtMulHighI8x16S => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
let a_high = builder.ins().swiden_high(a);
let b_high = builder.ins().swiden_high(b);
state.push1(builder.ins().imul(a_high, b_high));
}
Operator::I16x8ExtMulLowI8x16U => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
let a_low = builder.ins().uwiden_low(a);
let b_low = builder.ins().uwiden_low(b);
state.push1(builder.ins().imul(a_low, b_low));
}
Operator::I16x8ExtMulHighI8x16U => {
let (a, b) = pop2_with_bitcast(state, I8X16, builder);
let a_high = builder.ins().uwiden_high(a);
let b_high = builder.ins().uwiden_high(b);
state.push1(builder.ins().imul(a_high, b_high));
}
Operator::I32x4ExtMulLowI16x8S => {
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
let a_low = builder.ins().swiden_low(a);
let b_low = builder.ins().swiden_low(b);
state.push1(builder.ins().imul(a_low, b_low));
}
Operator::I32x4ExtMulHighI16x8S => {
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
let a_high = builder.ins().swiden_high(a);
let b_high = builder.ins().swiden_high(b);
state.push1(builder.ins().imul(a_high, b_high));
}
Operator::I32x4ExtMulLowI16x8U => {
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
let a_low = builder.ins().uwiden_low(a);
let b_low = builder.ins().uwiden_low(b);
state.push1(builder.ins().imul(a_low, b_low));
}
Operator::I32x4ExtMulHighI16x8U => {
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
let a_high = builder.ins().uwiden_high(a);
let b_high = builder.ins().uwiden_high(b);
state.push1(builder.ins().imul(a_high, b_high));
}
Operator::I64x2ExtMulLowI32x4S => {
let (a, b) = pop2_with_bitcast(state, I32X4, builder);
let a_low = builder.ins().swiden_low(a);
let b_low = builder.ins().swiden_low(b);
state.push1(builder.ins().imul(a_low, b_low));
}
Operator::I64x2ExtMulHighI32x4S => {
let (a, b) = pop2_with_bitcast(state, I32X4, builder);
let a_high = builder.ins().swiden_high(a);
let b_high = builder.ins().swiden_high(b);
state.push1(builder.ins().imul(a_high, b_high));
}
Operator::I64x2ExtMulLowI32x4U => {
let (a, b) = pop2_with_bitcast(state, I32X4, builder);
let a_low = builder.ins().uwiden_low(a);
let b_low = builder.ins().uwiden_low(b);
state.push1(builder.ins().imul(a_low, b_low));
}
Operator::I64x2ExtMulHighI32x4U => {
let (a, b) = pop2_with_bitcast(state, I32X4, builder);
let a_high = builder.ins().uwiden_high(a);
let b_high = builder.ins().uwiden_high(b);
state.push1(builder.ins().imul(a_high, b_high));
}
Operator::I16x8ExtAddPairwiseI8x16S
| Operator::I16x8ExtAddPairwiseI8x16U
| Operator::I32x4ExtAddPairwiseI16x8S
| Operator::I32x4ExtAddPairwiseI16x8U => {