Merge pull request #2440 from jlb6740/remaining_simd_conversions
Adds support for i32x4.trunc_sat_f32x4_u
This commit is contained in:
1
build.rs
1
build.rs
@@ -210,6 +210,7 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
|
|||||||
("simd", "simd_load_splat") => return false,
|
("simd", "simd_load_splat") => return false,
|
||||||
("simd", "simd_splat") => return false,
|
("simd", "simd_splat") => return false,
|
||||||
("simd", "simd_store") => return false,
|
("simd", "simd_store") => return false,
|
||||||
|
("simd", "simd_conversions") => return false,
|
||||||
("simd", _) => return true,
|
("simd", _) => return true,
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -467,7 +467,10 @@ pub enum SseOpcode {
|
|||||||
Pabsb,
|
Pabsb,
|
||||||
Pabsw,
|
Pabsw,
|
||||||
Pabsd,
|
Pabsd,
|
||||||
|
Packssdw,
|
||||||
Packsswb,
|
Packsswb,
|
||||||
|
Packusdw,
|
||||||
|
Packuswb,
|
||||||
Paddb,
|
Paddb,
|
||||||
Paddd,
|
Paddd,
|
||||||
Paddq,
|
Paddq,
|
||||||
@@ -476,6 +479,7 @@ pub enum SseOpcode {
|
|||||||
Paddsw,
|
Paddsw,
|
||||||
Paddusb,
|
Paddusb,
|
||||||
Paddusw,
|
Paddusw,
|
||||||
|
Palignr,
|
||||||
Pand,
|
Pand,
|
||||||
Pandn,
|
Pandn,
|
||||||
Pavgb,
|
Pavgb,
|
||||||
@@ -507,6 +511,18 @@ pub enum SseOpcode {
|
|||||||
Pminuw,
|
Pminuw,
|
||||||
Pminud,
|
Pminud,
|
||||||
Pmovmskb,
|
Pmovmskb,
|
||||||
|
Pmovsxbd,
|
||||||
|
Pmovsxbw,
|
||||||
|
Pmovsxbq,
|
||||||
|
Pmovsxwd,
|
||||||
|
Pmovsxwq,
|
||||||
|
Pmovsxdq,
|
||||||
|
Pmovzxbd,
|
||||||
|
Pmovzxbw,
|
||||||
|
Pmovzxbq,
|
||||||
|
Pmovzxwd,
|
||||||
|
Pmovzxwq,
|
||||||
|
Pmovzxdq,
|
||||||
Pmulld,
|
Pmulld,
|
||||||
Pmullw,
|
Pmullw,
|
||||||
Pmuludq,
|
Pmuludq,
|
||||||
@@ -620,7 +636,9 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Mulpd
|
| SseOpcode::Mulpd
|
||||||
| SseOpcode::Mulsd
|
| SseOpcode::Mulsd
|
||||||
| SseOpcode::Orpd
|
| SseOpcode::Orpd
|
||||||
|
| SseOpcode::Packssdw
|
||||||
| SseOpcode::Packsswb
|
| SseOpcode::Packsswb
|
||||||
|
| SseOpcode::Packuswb
|
||||||
| SseOpcode::Paddb
|
| SseOpcode::Paddb
|
||||||
| SseOpcode::Paddd
|
| SseOpcode::Paddd
|
||||||
| SseOpcode::Paddq
|
| SseOpcode::Paddq
|
||||||
@@ -676,9 +694,14 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Ucomisd
|
| SseOpcode::Ucomisd
|
||||||
| SseOpcode::Xorpd => SSE2,
|
| SseOpcode::Xorpd => SSE2,
|
||||||
|
|
||||||
SseOpcode::Pabsb | SseOpcode::Pabsw | SseOpcode::Pabsd | SseOpcode::Pshufb => SSSE3,
|
SseOpcode::Pabsb
|
||||||
|
| SseOpcode::Pabsw
|
||||||
|
| SseOpcode::Pabsd
|
||||||
|
| SseOpcode::Palignr
|
||||||
|
| SseOpcode::Pshufb => SSSE3,
|
||||||
|
|
||||||
SseOpcode::Insertps
|
SseOpcode::Insertps
|
||||||
|
| SseOpcode::Packusdw
|
||||||
| SseOpcode::Pcmpeqq
|
| SseOpcode::Pcmpeqq
|
||||||
| SseOpcode::Pextrb
|
| SseOpcode::Pextrb
|
||||||
| SseOpcode::Pextrd
|
| SseOpcode::Pextrd
|
||||||
@@ -692,6 +715,18 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Pminsd
|
| SseOpcode::Pminsd
|
||||||
| SseOpcode::Pminuw
|
| SseOpcode::Pminuw
|
||||||
| SseOpcode::Pminud
|
| SseOpcode::Pminud
|
||||||
|
| SseOpcode::Pmovsxbd
|
||||||
|
| SseOpcode::Pmovsxbw
|
||||||
|
| SseOpcode::Pmovsxbq
|
||||||
|
| SseOpcode::Pmovsxwd
|
||||||
|
| SseOpcode::Pmovsxwq
|
||||||
|
| SseOpcode::Pmovsxdq
|
||||||
|
| SseOpcode::Pmovzxbd
|
||||||
|
| SseOpcode::Pmovzxbw
|
||||||
|
| SseOpcode::Pmovzxbq
|
||||||
|
| SseOpcode::Pmovzxwd
|
||||||
|
| SseOpcode::Pmovzxwq
|
||||||
|
| SseOpcode::Pmovzxdq
|
||||||
| SseOpcode::Pmulld
|
| SseOpcode::Pmulld
|
||||||
| SseOpcode::Ptest
|
| SseOpcode::Ptest
|
||||||
| SseOpcode::Roundss
|
| SseOpcode::Roundss
|
||||||
@@ -772,7 +807,10 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Pabsb => "pabsb",
|
SseOpcode::Pabsb => "pabsb",
|
||||||
SseOpcode::Pabsw => "pabsw",
|
SseOpcode::Pabsw => "pabsw",
|
||||||
SseOpcode::Pabsd => "pabsd",
|
SseOpcode::Pabsd => "pabsd",
|
||||||
|
SseOpcode::Packssdw => "packssdw",
|
||||||
SseOpcode::Packsswb => "packsswb",
|
SseOpcode::Packsswb => "packsswb",
|
||||||
|
SseOpcode::Packusdw => "packusdw",
|
||||||
|
SseOpcode::Packuswb => "packuswb",
|
||||||
SseOpcode::Paddb => "paddb",
|
SseOpcode::Paddb => "paddb",
|
||||||
SseOpcode::Paddd => "paddd",
|
SseOpcode::Paddd => "paddd",
|
||||||
SseOpcode::Paddq => "paddq",
|
SseOpcode::Paddq => "paddq",
|
||||||
@@ -781,6 +819,7 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Paddsw => "paddsw",
|
SseOpcode::Paddsw => "paddsw",
|
||||||
SseOpcode::Paddusb => "paddusb",
|
SseOpcode::Paddusb => "paddusb",
|
||||||
SseOpcode::Paddusw => "paddusw",
|
SseOpcode::Paddusw => "paddusw",
|
||||||
|
SseOpcode::Palignr => "palignr",
|
||||||
SseOpcode::Pand => "pand",
|
SseOpcode::Pand => "pand",
|
||||||
SseOpcode::Pandn => "pandn",
|
SseOpcode::Pandn => "pandn",
|
||||||
SseOpcode::Pavgb => "pavgb",
|
SseOpcode::Pavgb => "pavgb",
|
||||||
@@ -812,6 +851,18 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Pminuw => "pminuw",
|
SseOpcode::Pminuw => "pminuw",
|
||||||
SseOpcode::Pminud => "pminud",
|
SseOpcode::Pminud => "pminud",
|
||||||
SseOpcode::Pmovmskb => "pmovmskb",
|
SseOpcode::Pmovmskb => "pmovmskb",
|
||||||
|
SseOpcode::Pmovsxbd => "pmovsxbd",
|
||||||
|
SseOpcode::Pmovsxbw => "pmovsxbw",
|
||||||
|
SseOpcode::Pmovsxbq => "pmovsxbq",
|
||||||
|
SseOpcode::Pmovsxwd => "pmovsxwd",
|
||||||
|
SseOpcode::Pmovsxwq => "pmovsxwq",
|
||||||
|
SseOpcode::Pmovsxdq => "pmovsxdq",
|
||||||
|
SseOpcode::Pmovzxbd => "pmovzxbd",
|
||||||
|
SseOpcode::Pmovzxbw => "pmovzxbw",
|
||||||
|
SseOpcode::Pmovzxbq => "pmovzxbq",
|
||||||
|
SseOpcode::Pmovzxwd => "pmovzxwd",
|
||||||
|
SseOpcode::Pmovzxwq => "pmovzxwq",
|
||||||
|
SseOpcode::Pmovzxdq => "pmovzxdq",
|
||||||
SseOpcode::Pmulld => "pmulld",
|
SseOpcode::Pmulld => "pmulld",
|
||||||
SseOpcode::Pmullw => "pmullw",
|
SseOpcode::Pmullw => "pmullw",
|
||||||
SseOpcode::Pmuludq => "pmuludq",
|
SseOpcode::Pmuludq => "pmuludq",
|
||||||
|
|||||||
@@ -1781,7 +1781,10 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
|
SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
|
||||||
SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
|
SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
|
||||||
SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
|
SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
|
||||||
|
SseOpcode::Packssdw => (LegacyPrefixes::_66, 0x0F6B, 2),
|
||||||
SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
|
SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
|
||||||
|
SseOpcode::Packusdw => (LegacyPrefixes::_66, 0x0F382B, 3),
|
||||||
|
SseOpcode::Packuswb => (LegacyPrefixes::_66, 0x0F67, 2),
|
||||||
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
|
SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
|
||||||
SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
|
SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
|
||||||
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
|
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
|
||||||
@@ -1802,6 +1805,18 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
|
SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
|
||||||
SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
|
SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
|
||||||
SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
|
SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
|
||||||
|
SseOpcode::Pmovsxbd => (LegacyPrefixes::_66, 0x0F3821, 3),
|
||||||
|
SseOpcode::Pmovsxbw => (LegacyPrefixes::_66, 0x0F3820, 3),
|
||||||
|
SseOpcode::Pmovsxbq => (LegacyPrefixes::_66, 0x0F3822, 3),
|
||||||
|
SseOpcode::Pmovsxwd => (LegacyPrefixes::_66, 0x0F3823, 3),
|
||||||
|
SseOpcode::Pmovsxwq => (LegacyPrefixes::_66, 0x0F3824, 3),
|
||||||
|
SseOpcode::Pmovsxdq => (LegacyPrefixes::_66, 0x0F3825, 3),
|
||||||
|
SseOpcode::Pmovzxbd => (LegacyPrefixes::_66, 0x0F3831, 3),
|
||||||
|
SseOpcode::Pmovzxbw => (LegacyPrefixes::_66, 0x0F3830, 3),
|
||||||
|
SseOpcode::Pmovzxbq => (LegacyPrefixes::_66, 0x0F3832, 3),
|
||||||
|
SseOpcode::Pmovzxwd => (LegacyPrefixes::_66, 0x0F3833, 3),
|
||||||
|
SseOpcode::Pmovzxwq => (LegacyPrefixes::_66, 0x0F3834, 3),
|
||||||
|
SseOpcode::Pmovzxdq => (LegacyPrefixes::_66, 0x0F3835, 3),
|
||||||
SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
|
SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
|
||||||
SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
|
SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
|
||||||
SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
|
SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
|
||||||
@@ -1958,6 +1973,7 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
|
SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
|
||||||
SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
|
SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
|
||||||
SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
|
SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
|
||||||
|
SseOpcode::Palignr => (LegacyPrefixes::_66, 0x0F3A0F, 3),
|
||||||
SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
|
SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
|
||||||
SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
|
SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
|
||||||
SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
|
SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
|
||||||
|
|||||||
@@ -3151,12 +3151,30 @@ fn test_x64_emit() {
|
|||||||
"pshufb %xmm11, %xmm2",
|
"pshufb %xmm11, %xmm2",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(xmm11), w_xmm12),
|
||||||
|
"66450F6BE3",
|
||||||
|
"packssdw %xmm11, %xmm12",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2),
|
Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(xmm11), w_xmm2),
|
||||||
"66410F63D3",
|
"66410F63D3",
|
||||||
"packsswb %xmm11, %xmm2",
|
"packsswb %xmm11, %xmm2",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(xmm13), w_xmm6),
|
||||||
|
"66410F382BF5",
|
||||||
|
"packusdw %xmm13, %xmm6",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(xmm9), w_xmm4),
|
||||||
|
"66410F67E1",
|
||||||
|
"packuswb %xmm9, %xmm4",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Punpckhbw, RegMem::reg(xmm3), w_xmm2),
|
Inst::xmm_rm_r(SseOpcode::Punpckhbw, RegMem::reg(xmm3), w_xmm2),
|
||||||
"660F68D3",
|
"660F68D3",
|
||||||
@@ -3183,6 +3201,81 @@ fn test_x64_emit() {
|
|||||||
"cvttps2dq %xmm9, %xmm8",
|
"cvttps2dq %xmm9, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
// ========================================================
|
||||||
|
// XMM_RM_R: Packed Move
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovsxbd, RegMem::reg(xmm6), w_xmm8),
|
||||||
|
"66440F3821C6",
|
||||||
|
"pmovsxbd %xmm6, %xmm8",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::reg(xmm9), w_xmm10),
|
||||||
|
"66450F3820D1",
|
||||||
|
"pmovsxbw %xmm9, %xmm10",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovsxbq, RegMem::reg(xmm1), w_xmm1),
|
||||||
|
"660F3822C9",
|
||||||
|
"pmovsxbq %xmm1, %xmm1",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::reg(xmm13), w_xmm10),
|
||||||
|
"66450F3823D5",
|
||||||
|
"pmovsxwd %xmm13, %xmm10",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovsxwq, RegMem::reg(xmm12), w_xmm12),
|
||||||
|
"66450F3824E4",
|
||||||
|
"pmovsxwq %xmm12, %xmm12",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovsxdq, RegMem::reg(xmm10), w_xmm8),
|
||||||
|
"66450F3825C2",
|
||||||
|
"pmovsxdq %xmm10, %xmm8",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovzxbd, RegMem::reg(xmm5), w_xmm6),
|
||||||
|
"660F3831F5",
|
||||||
|
"pmovzxbd %xmm5, %xmm6",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::reg(xmm5), w_xmm13),
|
||||||
|
"66440F3830ED",
|
||||||
|
"pmovzxbw %xmm5, %xmm13",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovzxbq, RegMem::reg(xmm10), w_xmm11),
|
||||||
|
"66450F3832DA",
|
||||||
|
"pmovzxbq %xmm10, %xmm11",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::reg(xmm2), w_xmm10),
|
||||||
|
"66440F3833D2",
|
||||||
|
"pmovzxwd %xmm2, %xmm10",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovzxwq, RegMem::reg(xmm7), w_xmm4),
|
||||||
|
"660F3834E7",
|
||||||
|
"pmovzxwq %xmm7, %xmm4",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pmovzxdq, RegMem::reg(xmm3), w_xmm4),
|
||||||
|
"660F3835E3",
|
||||||
|
"pmovzxdq %xmm3, %xmm4",
|
||||||
|
));
|
||||||
|
|
||||||
// XMM_Mov_R_M: float stores
|
// XMM_Mov_R_M: float stores
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)),
|
Inst::xmm_mov_r_m(SseOpcode::Movss, xmm15, Amode::imm_reg(128, r12)),
|
||||||
@@ -3406,6 +3499,11 @@ fn test_x64_emit() {
|
|||||||
"410FC2FF00",
|
"410FC2FF00",
|
||||||
"cmpps $0, %xmm15, %xmm7",
|
"cmpps $0, %xmm15, %xmm7",
|
||||||
));
|
));
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_rm_r_imm(SseOpcode::Palignr, RegMem::reg(xmm1), w_xmm9, 3, false),
|
||||||
|
"66440F3A0FC903",
|
||||||
|
"palignr $3, %xmm1, %xmm9",
|
||||||
|
));
|
||||||
|
|
||||||
// ========================================================
|
// ========================================================
|
||||||
// Pertaining to atomics.
|
// Pertaining to atomics.
|
||||||
|
|||||||
@@ -2722,6 +2722,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
} else {
|
} else {
|
||||||
if op == Opcode::FcvtToSintSat {
|
if op == Opcode::FcvtToSintSat {
|
||||||
// Sets destination to zero if float is NaN
|
// Sets destination to zero if float is NaN
|
||||||
|
assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
|
||||||
let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
|
let tmp = ctx.alloc_tmp(RegClass::V128, types::I32X4);
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
ctx.emit(Inst::xmm_unary_rm_r(
|
||||||
SseOpcode::Movapd,
|
SseOpcode::Movapd,
|
||||||
@@ -2776,7 +2777,118 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
dst,
|
dst,
|
||||||
));
|
));
|
||||||
} else if op == Opcode::FcvtToUintSat {
|
} else if op == Opcode::FcvtToUintSat {
|
||||||
unimplemented!("f32x4.convert_i32x4_u");
|
// The algorithm for converting floats to unsigned ints is a little tricky. The
|
||||||
|
// complication arises because we are converting from a signed 32-bit int with a positive
|
||||||
|
// integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
|
||||||
|
// range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
|
||||||
|
// (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
|
||||||
|
// conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
|
||||||
|
// which conveniently sets underflows and overflows (smaller than MIN_INT or larger than
|
||||||
|
// MAX_INT) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes
|
||||||
|
// precisely INT_MAX values we can correctly account for and convert every value in this range
|
||||||
|
// if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
|
||||||
|
// every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
|
||||||
|
// After the conversion we add INT_MAX+1 back to this converted value, noting again that
|
||||||
|
// values we are trying to account for were already set to INT_MAX+1 during the original conversion.
|
||||||
|
// We simply have to create a mask and make sure we are adding together only the lanes that need
|
||||||
|
// to be accounted for. Digesting it all the steps then are:
|
||||||
|
//
|
||||||
|
// Step 1 - Account for NaN and negative floats by setting these src values to zero.
|
||||||
|
// Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
|
||||||
|
// reasons described above.
|
||||||
|
// Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
|
||||||
|
// Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
|
||||||
|
// values that were originally in the range (0..INT_MAX). This will come in handy during
|
||||||
|
// step 7 when we zero negative lanes.
|
||||||
|
// Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
|
||||||
|
// UINT_MAX that are now less than INT_MAX thanks to the subtraction.
|
||||||
|
// Step 6 - Convert the second set of values (tmp1)
|
||||||
|
// Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
|
||||||
|
// converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
|
||||||
|
// as this will allow us to properly saturate overflow lanes when adding to 0x80000000
|
||||||
|
// Step 8 - Add the original converted src and the converted tmp1 where float values originally less
|
||||||
|
// than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
|
||||||
|
// UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
|
||||||
|
// greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// The table below illustrates the result after each step where it matters for the converted set.
|
||||||
|
// Note the original value range (original src set) is the final dst in Step 8:
|
||||||
|
//
|
||||||
|
// Original src set:
|
||||||
|
// | Original Value Range | Step 1 | Step 3 | Step 8 |
|
||||||
|
// | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
|
||||||
|
//
|
||||||
|
// Copied src set (tmp1):
|
||||||
|
// | Step 2 | Step 4 |
|
||||||
|
// | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
|
||||||
|
//
|
||||||
|
// | Step 6 | Step 7 |
|
||||||
|
// | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
|
||||||
|
|
||||||
|
// Create temporaries
|
||||||
|
assert_eq!(types::F32X4, ctx.input_ty(insn, 0));
|
||||||
|
let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I32X4);
|
||||||
|
let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I32X4);
|
||||||
|
|
||||||
|
// Converting to unsigned int so if float src is negative or NaN
|
||||||
|
// it will first be set to zero.
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, input_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst));
|
||||||
|
|
||||||
|
// Set tmp2 to INT_MAX+1. It is important to note here that although it looks
|
||||||
|
// like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
|
||||||
|
// single precision IEEE-754 floats can only accurately represent contiguous
|
||||||
|
// integers up to 2^24 and outside of this range it rounds to the closest
|
||||||
|
// integer that it can represent. In the case of INT_MAX, this value gets
|
||||||
|
// represented as 0x4f000000 which is the integer value (INT_MAX+1).
|
||||||
|
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2));
|
||||||
|
ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Cvtdq2ps,
|
||||||
|
RegMem::from(tmp2),
|
||||||
|
tmp2,
|
||||||
|
));
|
||||||
|
|
||||||
|
// Make a copy of these lanes and then do the first conversion.
|
||||||
|
// Overflow lanes greater than the maximum allowed signed value will
|
||||||
|
// be set to 0x80000000. Negative and NaN lanes will be 0x0
|
||||||
|
ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Cvttps2dq, RegMem::from(dst), dst));
|
||||||
|
|
||||||
|
// Set lanes to src - max_signed_int
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1));
|
||||||
|
|
||||||
|
// Create mask for all positive lanes to saturate (i.e. greater than
|
||||||
|
// or equal to the maximum allowable unsigned int).
|
||||||
|
let cond = FcmpImm::from(FloatCC::LessThanOrEqual);
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Cmpps,
|
||||||
|
RegMem::from(tmp1),
|
||||||
|
tmp2,
|
||||||
|
cond.encode(),
|
||||||
|
false,
|
||||||
|
));
|
||||||
|
|
||||||
|
// Convert the set of lanes that have the max_signed_int factored out.
|
||||||
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Cvttps2dq,
|
||||||
|
RegMem::from(tmp1),
|
||||||
|
tmp1,
|
||||||
|
));
|
||||||
|
|
||||||
|
// Prepare converted lanes by zeroing negative lanes and prepping lanes
|
||||||
|
// that have positive overflow (based on the mask) by setting these lanes
|
||||||
|
// to 0x7FFFFFFF
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1));
|
||||||
|
|
||||||
|
// Add this second set of converted lanes to the original to properly handle
|
||||||
|
// values greater than max signed int.
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst));
|
||||||
} else {
|
} else {
|
||||||
// Since this branch is also guarded by a check for vector types
|
// Since this branch is also guarded by a check for vector types
|
||||||
// neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here
|
// neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here
|
||||||
@@ -2786,7 +2898,127 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => {
|
||||||
|
let input_ty = ctx.input_ty(insn, 0);
|
||||||
|
let output_ty = ctx.output_ty(insn, 0);
|
||||||
|
let src = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
if output_ty.is_vector() {
|
||||||
|
match op {
|
||||||
|
Opcode::SwidenLow => match (input_ty, output_ty) {
|
||||||
|
(types::I8X16, types::I16X8) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
(types::I16X8, types::I32X4) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
Opcode::SwidenHigh => match (input_ty, output_ty) {
|
||||||
|
(types::I8X16, types::I16X8) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Palignr,
|
||||||
|
RegMem::reg(src),
|
||||||
|
dst,
|
||||||
|
8,
|
||||||
|
false,
|
||||||
|
));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxbw, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
(types::I16X8, types::I32X4) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Palignr,
|
||||||
|
RegMem::reg(src),
|
||||||
|
dst,
|
||||||
|
8,
|
||||||
|
false,
|
||||||
|
));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovsxwd, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
Opcode::UwidenLow => match (input_ty, output_ty) {
|
||||||
|
(types::I8X16, types::I16X8) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
(types::I16X8, types::I32X4) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
Opcode::UwidenHigh => match (input_ty, output_ty) {
|
||||||
|
(types::I8X16, types::I16X8) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Palignr,
|
||||||
|
RegMem::reg(src),
|
||||||
|
dst,
|
||||||
|
8,
|
||||||
|
false,
|
||||||
|
));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxbw, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
(types::I16X8, types::I32X4) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, output_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Palignr,
|
||||||
|
RegMem::reg(src),
|
||||||
|
dst,
|
||||||
|
8,
|
||||||
|
false,
|
||||||
|
));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmovzxwd, RegMem::from(dst), dst));
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
panic!("Unsupported non-vector type for widen instruction {:?}", ty);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Opcode::Snarrow | Opcode::Unarrow => {
|
||||||
|
let input_ty = ctx.input_ty(insn, 0);
|
||||||
|
let output_ty = ctx.output_ty(insn, 0);
|
||||||
|
let src1 = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
let src2 = put_input_in_reg(ctx, inputs[1]);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
if output_ty.is_vector() {
|
||||||
|
match op {
|
||||||
|
Opcode::Snarrow => match (input_ty, output_ty) {
|
||||||
|
(types::I16X8, types::I8X16) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src1, input_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src2), dst));
|
||||||
|
}
|
||||||
|
(types::I32X4, types::I16X8) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src1, input_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packssdw, RegMem::reg(src2), dst));
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
Opcode::Unarrow => match (input_ty, output_ty) {
|
||||||
|
(types::I16X8, types::I8X16) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src1, input_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packuswb, RegMem::reg(src2), dst));
|
||||||
|
}
|
||||||
|
(types::I32X4, types::I16X8) => {
|
||||||
|
ctx.emit(Inst::gen_move(dst, src1, input_ty));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packusdw, RegMem::reg(src2), dst));
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
panic!("Unsupported non-vector type for widen instruction {:?}", ty);
|
||||||
|
}
|
||||||
|
}
|
||||||
Opcode::Bitcast => {
|
Opcode::Bitcast => {
|
||||||
let input_ty = ctx.input_ty(insn, 0);
|
let input_ty = ctx.input_ty(insn, 0);
|
||||||
let output_ty = ctx.output_ty(insn, 0);
|
let output_ty = ctx.output_ty(insn, 0);
|
||||||
|
|||||||
Reference in New Issue
Block a user