x64: Add support for the pblendw instruction (#6023)
This commit adds another case for `shuffle` lowering to the x64 backend
for the `{,v}pblendw` instruction. This instruction builds its result by
taking each 16-bit lane from one of its two inputs, as directed by an 8-bit
immediate mask in which bit N chooses which input supplies lane N.
This commit is contained in:
@@ -918,6 +918,7 @@
|
|||||||
Punpcklqdq
|
Punpcklqdq
|
||||||
Pshuflw
|
Pshuflw
|
||||||
Pshufhw
|
Pshufhw
|
||||||
|
Pblendw
|
||||||
))
|
))
|
||||||
|
|
||||||
(type CmpOpcode extern
|
(type CmpOpcode extern
|
||||||
@@ -1290,6 +1291,7 @@
|
|||||||
Vpextrw
|
Vpextrw
|
||||||
Vpextrd
|
Vpextrd
|
||||||
Vpextrq
|
Vpextrq
|
||||||
|
Vpblendw
|
||||||
))
|
))
|
||||||
|
|
||||||
(type Avx512Opcode extern
|
(type Avx512Opcode extern
|
||||||
@@ -2967,6 +2969,14 @@
|
|||||||
(if-let $true (has_avx))
|
(if-let $true (has_avx))
|
||||||
(xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask))
|
(xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask))
|
||||||
|
|
||||||
|
;; Helper for creating `pblendw` instructions.
|
||||||
|
(decl x64_pblendw (Xmm XmmMem u8) Xmm)
|
||||||
|
(rule 0 (x64_pblendw src1 src2 imm)
|
||||||
|
(xmm_rm_r_imm (SseOpcode.Pblendw) src1 src2 imm (OperandSize.Size32)))
|
||||||
|
(rule 1 (x64_pblendw src1 src2 imm)
|
||||||
|
(if-let $true (has_avx))
|
||||||
|
(xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm))
|
||||||
|
|
||||||
;; Helper for creating a `movsd` instruction which creates a new vector
|
;; Helper for creating a `movsd` instruction which creates a new vector
|
||||||
;; register where the upper 64-bits are from the first operand and the low
|
;; register where the upper 64-bits are from the first operand and the low
|
||||||
;; 64-bits are from the second operand.
|
;; 64-bits are from the second operand.
|
||||||
|
|||||||
@@ -1125,6 +1125,7 @@ pub enum SseOpcode {
|
|||||||
Punpcklqdq,
|
Punpcklqdq,
|
||||||
Pshuflw,
|
Pshuflw,
|
||||||
Pshufhw,
|
Pshufhw,
|
||||||
|
Pblendw,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SseOpcode {
|
impl SseOpcode {
|
||||||
@@ -1318,7 +1319,8 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Roundps
|
| SseOpcode::Roundps
|
||||||
| SseOpcode::Roundpd
|
| SseOpcode::Roundpd
|
||||||
| SseOpcode::Roundss
|
| SseOpcode::Roundss
|
||||||
| SseOpcode::Roundsd => SSE41,
|
| SseOpcode::Roundsd
|
||||||
|
| SseOpcode::Pblendw => SSE41,
|
||||||
|
|
||||||
SseOpcode::Pcmpgtq => SSE42,
|
SseOpcode::Pcmpgtq => SSE42,
|
||||||
}
|
}
|
||||||
@@ -1521,6 +1523,7 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Punpckhqdq => "punpckhqdq",
|
SseOpcode::Punpckhqdq => "punpckhqdq",
|
||||||
SseOpcode::Pshuflw => "pshuflw",
|
SseOpcode::Pshuflw => "pshuflw",
|
||||||
SseOpcode::Pshufhw => "pshufhw",
|
SseOpcode::Pshufhw => "pshufhw",
|
||||||
|
SseOpcode::Pblendw => "pblendw",
|
||||||
};
|
};
|
||||||
write!(fmt, "{}", name)
|
write!(fmt, "{}", name)
|
||||||
}
|
}
|
||||||
@@ -1705,7 +1708,8 @@ impl AvxOpcode {
|
|||||||
| AvxOpcode::Vpextrb
|
| AvxOpcode::Vpextrb
|
||||||
| AvxOpcode::Vpextrw
|
| AvxOpcode::Vpextrw
|
||||||
| AvxOpcode::Vpextrd
|
| AvxOpcode::Vpextrd
|
||||||
| AvxOpcode::Vpextrq => {
|
| AvxOpcode::Vpextrq
|
||||||
|
| AvxOpcode::Vpblendw => {
|
||||||
smallvec![InstructionSet::AVX]
|
smallvec![InstructionSet::AVX]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2263,6 +2263,7 @@ pub(crate) fn emit(
|
|||||||
AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
|
AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
|
||||||
AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
|
AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
|
||||||
AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
|
AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
|
||||||
|
AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E),
|
||||||
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
|
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -2719,6 +2720,7 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
|
SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
|
||||||
SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
|
SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
|
||||||
SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
|
SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
|
||||||
|
SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3),
|
||||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
};
|
};
|
||||||
let rex = RexFlags::from(*size);
|
let rex = RexFlags::from(*size);
|
||||||
|
|||||||
@@ -3704,6 +3704,15 @@
|
|||||||
|
|
||||||
;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Special case for `pblendw` which takes an 8-bit immediate where each bit
|
||||||
|
;; indicates which lane of the two operands is chosen for the output. A bit of
|
||||||
|
;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the
|
||||||
|
;; corresponding 16-bit lane from `b`.
|
||||||
|
(rule 14 (lower (shuffle a b (pblendw_imm n)))
|
||||||
|
(x64_pblendw a b n))
|
||||||
|
(decl pblendw_imm (u8) Immediate)
|
||||||
|
(extern extractor pblendw_imm pblendw_imm)
|
||||||
|
|
||||||
;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
|
;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
|
||||||
;; bytes", that's a `palignr` instruction. Note that the order of operands are
|
;; bytes", that's a `palignr` instruction. Note that the order of operands are
|
||||||
;; swapped in the instruction here. The `palignr` instruction uses the second
|
;; swapped in the instruction here. The `palignr` instruction uses the second
|
||||||
|
|||||||
@@ -980,6 +980,41 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn pblendw_imm(&mut self, imm: Immediate) -> Option<u8> {
|
||||||
|
// First make sure that the shuffle immediate is selecting 16-bit lanes.
|
||||||
|
let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
|
||||||
|
|
||||||
|
// Next build up an 8-bit mask from each of the bits of the selected
|
||||||
|
// lanes above. This instruction can only be used when each lane
|
||||||
|
// selector chooses from the corresponding lane in either of the two
|
||||||
|
// operands, meaning the Nth lane selection must satisfy `lane % 8 ==
|
||||||
|
// N`.
|
||||||
|
//
|
||||||
|
// This helper closure is used to calculate the value of the
|
||||||
|
// corresponding bit.
|
||||||
|
let bit = |x: u8, c: u8| {
|
||||||
|
if x % 8 == c {
|
||||||
|
if x < 8 {
|
||||||
|
Some(0)
|
||||||
|
} else {
|
||||||
|
Some(1 << c)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Some(
|
||||||
|
bit(a, 0)?
|
||||||
|
| bit(b, 1)?
|
||||||
|
| bit(c, 2)?
|
||||||
|
| bit(d, 3)?
|
||||||
|
| bit(e, 4)?
|
||||||
|
| bit(f, 5)?
|
||||||
|
| bit(g, 6)?
|
||||||
|
| bit(h, 7)?,
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IsleContext<'_, '_, MInst, X64Backend> {
|
impl IsleContext<'_, '_, MInst, X64Backend> {
|
||||||
|
|||||||
@@ -114,3 +114,31 @@ block0(v0: i64x2, v1: i64x2):
|
|||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
|
|
||||||
|
function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = bitcast.i8x16 little v0
|
||||||
|
v3 = bitcast.i8x16 little v1
|
||||||
|
v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
|
||||||
|
v5 = bitcast.i16x8 little v4
|
||||||
|
return v5
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; vpblendw $153, %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; vpblendw $0x99, %xmm1, %xmm0, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
|||||||
@@ -654,9 +654,7 @@ block0(v0: i8x16, v1: i8x16):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movdqa %xmm0, %xmm4
|
; pblendw $0, %xmm0, %xmm1, %xmm0
|
||||||
; movdqa %xmm1, %xmm0
|
|
||||||
; palignr $0, %xmm0, %xmm4, %xmm0
|
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -666,9 +664,7 @@ block0(v0: i8x16, v1: i8x16):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block1: ; offset 0x4
|
; block1: ; offset 0x4
|
||||||
; movdqa %xmm0, %xmm4
|
; pblendw $0, %xmm1, %xmm0
|
||||||
; movdqa %xmm1, %xmm0
|
|
||||||
; palignr $0, %xmm4, %xmm0
|
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
@@ -770,9 +766,7 @@ block0(v0: i8x16, v1: i8x16):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movdqa %xmm0, %xmm4
|
; pblendw $255, %xmm0, %xmm1, %xmm0
|
||||||
; movdqa %xmm1, %xmm0
|
|
||||||
; palignr $16, %xmm0, %xmm4, %xmm0
|
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -782,9 +776,35 @@ block0(v0: i8x16, v1: i8x16):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block1: ; offset 0x4
|
; block1: ; offset 0x4
|
||||||
; movdqa %xmm0, %xmm4
|
; pblendw $0xff, %xmm1, %xmm0
|
||||||
; movdqa %xmm1, %xmm0
|
; movq %rbp, %rsp
|
||||||
; palignr $0x10, %xmm4, %xmm0
|
; popq %rbp
|
||||||
|
; retq
|
||||||
|
|
||||||
|
function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = bitcast.i8x16 little v0
|
||||||
|
v3 = bitcast.i8x16 little v1
|
||||||
|
v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
|
||||||
|
v5 = bitcast.i16x8 little v4
|
||||||
|
return v5
|
||||||
|
}
|
||||||
|
|
||||||
|
; VCode:
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pblendw $153, %xmm0, %xmm1, %xmm0
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
;
|
||||||
|
; Disassembled:
|
||||||
|
; block0: ; offset 0x0
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block1: ; offset 0x4
|
||||||
|
; pblendw $0x99, %xmm1, %xmm0
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; retq
|
; retq
|
||||||
|
|||||||
@@ -553,3 +553,13 @@ block0(v0: i64x2, v1: i64x2):
|
|||||||
return v5
|
return v5
|
||||||
}
|
}
|
||||||
; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605]
|
; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605]
|
||||||
|
|
||||||
|
function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
|
||||||
|
block0(v0: i16x8, v1: i16x8):
|
||||||
|
v2 = bitcast.i8x16 little v0
|
||||||
|
v3 = bitcast.i8x16 little v1
|
||||||
|
v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
|
||||||
|
v5 = bitcast.i16x8 little v4
|
||||||
|
return v5
|
||||||
|
}
|
||||||
|
; run: %pblendw_0b10011001([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 2 3 12 13 6 7 16]
|
||||||
|
|||||||
Reference in New Issue
Block a user