x64: Add support for phadd{w,d} instructions (#5896)
This commit adds support for the bare lowering of the `iadd_pairwise` instruction with `i16x8` and `i32x4` types on the x64 backend. These lowerings are achieved with the `phaddw` and `phaddd` instructions, respectively. AVX encodings of these instructions (`vphaddw` and `vphaddd`) are added as well. The motivation for these new lowerings comes from the relaxed-simd proposal, which will use them in the deterministic lowering of some instructions on the x64 backend.
This commit is contained in:
@@ -860,7 +860,10 @@
|
|||||||
Ucomisd
|
Ucomisd
|
||||||
Unpcklps
|
Unpcklps
|
||||||
Xorps
|
Xorps
|
||||||
Xorpd))
|
Xorpd
|
||||||
|
Phaddw
|
||||||
|
Phaddd
|
||||||
|
))
|
||||||
|
|
||||||
(type CmpOpcode extern
|
(type CmpOpcode extern
|
||||||
(enum Cmp
|
(enum Cmp
|
||||||
@@ -1356,6 +1359,8 @@
|
|||||||
Vcvtps2pd
|
Vcvtps2pd
|
||||||
Vcvttpd2dq
|
Vcvttpd2dq
|
||||||
Vcvttps2dq
|
Vcvttps2dq
|
||||||
|
Vphaddw
|
||||||
|
Vphaddd
|
||||||
))
|
))
|
||||||
|
|
||||||
(type Avx512Opcode extern
|
(type Avx512Opcode extern
|
||||||
@@ -2482,6 +2487,22 @@
|
|||||||
(if-let $true (has_avx))
|
(if-let $true (has_avx))
|
||||||
(xmm_rmir_vex (AvxOpcode.Vpaddsw) src1 src2))
|
(xmm_rmir_vex (AvxOpcode.Vpaddsw) src1 src2))
|
||||||
|
|
||||||
|
;; Helper for creating `phaddw` instructions.
|
||||||
|
(decl x64_phaddw (Xmm XmmMem) Xmm)
|
||||||
|
(rule 0 (x64_phaddw src1 src2)
|
||||||
|
(xmm_rm_r (SseOpcode.Phaddw) src1 src2))
|
||||||
|
(rule 1 (x64_phaddw src1 src2)
|
||||||
|
(if-let $true (has_avx))
|
||||||
|
(xmm_rmir_vex (AvxOpcode.Vphaddw) src1 src2))
|
||||||
|
|
||||||
|
;; Helper for creating `phaddd` instructions.
|
||||||
|
(decl x64_phaddd (Xmm XmmMem) Xmm)
|
||||||
|
(rule 0 (x64_phaddd src1 src2)
|
||||||
|
(xmm_rm_r (SseOpcode.Phaddd) src1 src2))
|
||||||
|
(rule 1 (x64_phaddd src1 src2)
|
||||||
|
(if-let $true (has_avx))
|
||||||
|
(xmm_rmir_vex (AvxOpcode.Vphaddd) src1 src2))
|
||||||
|
|
||||||
;; Helper for creating `paddusb` instructions.
|
;; Helper for creating `paddusb` instructions.
|
||||||
(decl x64_paddusb (Xmm XmmMem) Xmm)
|
(decl x64_paddusb (Xmm XmmMem) Xmm)
|
||||||
(rule 0 (x64_paddusb src1 src2)
|
(rule 0 (x64_paddusb src1 src2)
|
||||||
|
|||||||
@@ -1115,6 +1115,8 @@ pub enum SseOpcode {
|
|||||||
Unpcklps,
|
Unpcklps,
|
||||||
Xorps,
|
Xorps,
|
||||||
Xorpd,
|
Xorpd,
|
||||||
|
Phaddw,
|
||||||
|
Phaddd,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SseOpcode {
|
impl SseOpcode {
|
||||||
@@ -1261,7 +1263,9 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Pabsd
|
| SseOpcode::Pabsd
|
||||||
| SseOpcode::Palignr
|
| SseOpcode::Palignr
|
||||||
| SseOpcode::Pmulhrsw
|
| SseOpcode::Pmulhrsw
|
||||||
| SseOpcode::Pshufb => SSSE3,
|
| SseOpcode::Pshufb
|
||||||
|
| SseOpcode::Phaddw
|
||||||
|
| SseOpcode::Phaddd => SSSE3,
|
||||||
|
|
||||||
SseOpcode::Blendvpd
|
SseOpcode::Blendvpd
|
||||||
| SseOpcode::Blendvps
|
| SseOpcode::Blendvps
|
||||||
@@ -1495,6 +1499,8 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Unpcklps => "unpcklps",
|
SseOpcode::Unpcklps => "unpcklps",
|
||||||
SseOpcode::Xorps => "xorps",
|
SseOpcode::Xorps => "xorps",
|
||||||
SseOpcode::Xorpd => "xorpd",
|
SseOpcode::Xorpd => "xorpd",
|
||||||
|
SseOpcode::Phaddw => "phaddw",
|
||||||
|
SseOpcode::Phaddd => "phaddd",
|
||||||
};
|
};
|
||||||
write!(fmt, "{}", name)
|
write!(fmt, "{}", name)
|
||||||
}
|
}
|
||||||
@@ -1661,7 +1667,9 @@ impl AvxOpcode {
|
|||||||
| AvxOpcode::Vcvtpd2ps
|
| AvxOpcode::Vcvtpd2ps
|
||||||
| AvxOpcode::Vcvtps2pd
|
| AvxOpcode::Vcvtps2pd
|
||||||
| AvxOpcode::Vcvttpd2dq
|
| AvxOpcode::Vcvttpd2dq
|
||||||
| AvxOpcode::Vcvttps2dq => {
|
| AvxOpcode::Vcvttps2dq
|
||||||
|
| AvxOpcode::Vphaddw
|
||||||
|
| AvxOpcode::Vphaddd => {
|
||||||
smallvec![InstructionSet::AVX]
|
smallvec![InstructionSet::AVX]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1954,6 +1954,8 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
|
SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
|
||||||
SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
|
SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
|
||||||
SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
|
SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
|
||||||
|
SseOpcode::Phaddw => (LegacyPrefixes::_66, 0x0F3801, 3),
|
||||||
|
SseOpcode::Phaddd => (LegacyPrefixes::_66, 0x0F3802, 3),
|
||||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -2167,6 +2169,8 @@ pub(crate) fn emit(
|
|||||||
AvxOpcode::Vminsd => (LP::_F2, OM::_0F, 0x5D),
|
AvxOpcode::Vminsd => (LP::_F2, OM::_0F, 0x5D),
|
||||||
AvxOpcode::Vmaxss => (LP::_F3, OM::_0F, 0x5F),
|
AvxOpcode::Vmaxss => (LP::_F3, OM::_0F, 0x5F),
|
||||||
AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
|
AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
|
||||||
|
AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01),
|
||||||
|
AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02),
|
||||||
_ => panic!("unexpected rmir vex opcode {op:?}"),
|
_ => panic!("unexpected rmir vex opcode {op:?}"),
|
||||||
};
|
};
|
||||||
VexInstruction::new()
|
VexInstruction::new()
|
||||||
|
|||||||
@@ -3173,8 +3173,14 @@
|
|||||||
|
|
||||||
;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(rule (lower (has_type $I16X8 (iadd_pairwise x y)))
|
||||||
|
(x64_phaddw x y))
|
||||||
|
|
||||||
|
(rule (lower (has_type $I32X4 (iadd_pairwise x y)))
|
||||||
|
(x64_phaddd x y))
|
||||||
|
|
||||||
;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction
|
;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction
|
||||||
(rule (lower
|
(rule 1 (lower
|
||||||
(has_type $I16X8 (iadd_pairwise
|
(has_type $I16X8 (iadd_pairwise
|
||||||
(swiden_low val @ (value_type $I8X16))
|
(swiden_low val @ (value_type $I8X16))
|
||||||
(swiden_high val))))
|
(swiden_high val))))
|
||||||
@@ -3182,7 +3188,7 @@
|
|||||||
(x64_pmaddubsw mul_const val)))
|
(x64_pmaddubsw mul_const val)))
|
||||||
|
|
||||||
;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction
|
;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction
|
||||||
(rule (lower
|
(rule 1 (lower
|
||||||
(has_type $I32X4 (iadd_pairwise
|
(has_type $I32X4 (iadd_pairwise
|
||||||
(swiden_low val @ (value_type $I16X8))
|
(swiden_low val @ (value_type $I16X8))
|
||||||
(swiden_high val))))
|
(swiden_high val))))
|
||||||
@@ -3190,7 +3196,7 @@
|
|||||||
(x64_pmaddwd val mul_const)))
|
(x64_pmaddwd val mul_const)))
|
||||||
|
|
||||||
;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
|
;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
|
||||||
(rule (lower
|
(rule 1 (lower
|
||||||
(has_type $I16X8 (iadd_pairwise
|
(has_type $I16X8 (iadd_pairwise
|
||||||
(uwiden_low val @ (value_type $I8X16))
|
(uwiden_low val @ (value_type $I8X16))
|
||||||
(uwiden_high val))))
|
(uwiden_high val))))
|
||||||
@@ -3198,7 +3204,7 @@
|
|||||||
(x64_pmaddubsw val mul_const)))
|
(x64_pmaddubsw val mul_const)))
|
||||||
|
|
||||||
;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
|
;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
|
||||||
(rule (lower
|
(rule 1 (lower
|
||||||
(has_type $I32X4 (iadd_pairwise
|
(has_type $I32X4 (iadd_pairwise
|
||||||
(uwiden_low val @ (value_type $I16X8))
|
(uwiden_low val @ (value_type $I16X8))
|
||||||
(uwiden_high val))))
|
(uwiden_high val))))
|
||||||
@@ -3212,7 +3218,7 @@
|
|||||||
(x64_paddd dst addd_const)))
|
(x64_paddd dst addd_const)))
|
||||||
|
|
||||||
;; special case for the `i32x4.dot_i16x8_s` wasm instruction
|
;; special case for the `i32x4.dot_i16x8_s` wasm instruction
|
||||||
(rule (lower
|
(rule 1 (lower
|
||||||
(has_type $I32X4 (iadd_pairwise
|
(has_type $I32X4 (iadd_pairwise
|
||||||
(imul (swiden_low x) (swiden_low y))
|
(imul (swiden_low x) (swiden_low y))
|
||||||
(imul (swiden_high x) (swiden_high y)))))
|
(imul (swiden_high x) (swiden_high y)))))
|
||||||
|
|||||||
54
cranelift/filetests/filetests/isa/x64/iadd-pairwise-avx.clif
Normal file
54
cranelift/filetests/filetests/isa/x64/iadd-pairwise-avx.clif
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
test compile precise-output
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 has_avx
|
||||||
|
|
||||||
|
function %iadd_pairwise_i16x8(i16x8, i16x8) -> i16x8 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = iadd_pairwise v0, v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vphaddw %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vphaddw %xmm1, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 {
|
||||||
|
block0(v0: i32x4, v1: i32x4):
|
||||||
|
v2 = iadd_pairwise v0, v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vphaddd %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vphaddd %xmm1, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
54
cranelift/filetests/filetests/isa/x64/iadd-pairwise.clif
Normal file
54
cranelift/filetests/filetests/isa/x64/iadd-pairwise.clif
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
test compile precise-output
|
||||||
|
set enable_simd
|
||||||
|
target x86_64
|
||||||
|
|
||||||
|
function %iadd_pairwise_i16x8(i16x8, i16x8) -> i16x8 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = iadd_pairwise v0, v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; phaddw %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; phaddw %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %iadd_pairwise_i32x4(i32x4, i32x4) -> i32x4 {
|
||||||
|
block0(v0: i32x4, v1: i32x4):
|
||||||
|
v2 = iadd_pairwise v0, v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; phaddd %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; phaddd %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
Reference in New Issue
Block a user