x64: Add support for the pblendw instruction (#6023)

This commit adds another case for `shuffle` lowering to the x64 backend, using the `{,v}pblendw` instruction. This instruction selects each 16-bit lane of the result from one of its two inputs according to an 8-bit immediate mask, where each bit selects the input that supplies the corresponding lane.
2023-03-15 12:20:43 -05:00
parent fcddb9ca81
commit 6ed90f86c8
8 changed files with 132 additions and 14 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -918,6 +918,7 @@
            Punpcklqdq
            Pshuflw
            Pshufhw
+            Pblendw
          ))

 (type CmpOpcode extern
@@ -1290,6 +1291,7 @@
            Vpextrw
            Vpextrd
            Vpextrq
+            Vpblendw
          ))

 (type Avx512Opcode extern
@@ -2967,6 +2969,14 @@
      (if-let $true (has_avx))
      (xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask))

+;; Helper for creating `pblendw` instructions (`vpblendw` when `has_avx` holds).
+(decl x64_pblendw (Xmm XmmMem u8) Xmm)
+(rule 0 (x64_pblendw src1 src2 imm)
+      (xmm_rm_r_imm (SseOpcode.Pblendw) src1 src2 imm (OperandSize.Size32)))
+(rule 1 (x64_pblendw src1 src2 imm)
+      (if-let $true (has_avx))
+      (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm))
+
 ;; Helper for creating a `movsd` instruction which creates a new vector
 ;; register where the upper 64-bits are from the first operand and the low
 ;; 64-bits are from the second operand.
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1125,6 +1125,7 @@ pub enum SseOpcode {
    Punpcklqdq,
    Pshuflw,
    Pshufhw,
+    Pblendw,
 }

 impl SseOpcode {
@@ -1318,7 +1319,8 @@ impl SseOpcode {
            | SseOpcode::Roundps
            | SseOpcode::Roundpd
            | SseOpcode::Roundss
-            | SseOpcode::Roundsd => SSE41,
+            | SseOpcode::Roundsd
+            | SseOpcode::Pblendw => SSE41,

            SseOpcode::Pcmpgtq => SSE42,
        }
@@ -1521,6 +1523,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Punpckhqdq => "punpckhqdq",
            SseOpcode::Pshuflw => "pshuflw",
            SseOpcode::Pshufhw => "pshufhw",
+            SseOpcode::Pblendw => "pblendw",
        };
        write!(fmt, "{}", name)
    }
@@ -1705,7 +1708,8 @@ impl AvxOpcode {
            | AvxOpcode::Vpextrb
            | AvxOpcode::Vpextrw
            | AvxOpcode::Vpextrd
-            | AvxOpcode::Vpextrq => {
+            | AvxOpcode::Vpextrq
+            | AvxOpcode::Vpblendw => {
                smallvec![InstructionSet::AVX]
            }
        }
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2263,6 +2263,7 @@ pub(crate) fn emit(
                AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
                AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
                AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
+                AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E),
                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
            };

@@ -2719,6 +2720,7 @@ pub(crate) fn emit(
                SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
                SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
                SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
+                SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3),
                _ => unimplemented!("Opcode {:?} not implemented", op),
            };
            let rex = RexFlags::from(*size);
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3704,6 +3704,15 @@

 ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+;; Special case for `pblendw` which takes an 8-bit immediate where each bit
+;; indicates which lane of the two operands is chosen for the output. A bit of
+;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the
+;; corresponding 16-bit lane from `b`.
+(rule 14 (lower (shuffle a b (pblendw_imm n)))
+         (x64_pblendw a b n))
+(decl pblendw_imm (u8) Immediate)
+(extern extractor pblendw_imm pblendw_imm)
+
 ;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
 ;; bytes", that's a `palignr` instruction. Note that the order of operands are
 ;; swapped in the instruction here. The `palignr` instruction uses the second
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -980,6 +980,41 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
            None
        }
    }
+
+    fn pblendw_imm(&mut self, imm: Immediate) -> Option<u8> {
+        // Decode the immediate as eight 16-bit lane selectors; `?` yields `None` otherwise.
+        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
+
+        // Next build up the 8-bit mask, one bit per selected lane above. This
+        // instruction can only be used when each lane selector chooses from
+        // the corresponding lane in either of the two operands, meaning the
+        // Nth lane selection must satisfy `lane % 8 == N`; any selector that
+        // does not makes this whole extractor return `None`.
+        //
+        // `bit(x, c)` is selector `x`'s mask contribution for lane `c`: 0 if
+        // `x < 8`, else `1 << c`. Its `c` parameter shadows the tuple's `c`.
+        let bit = |x: u8, c: u8| {
+            if x % 8 == c {
+                if x < 8 {
+                    Some(0)
+                } else {
+                    Some(1 << c)
+                }
+            } else {
+                None
+            }
+        };
+        Some(
+            bit(a, 0)?
+                | bit(b, 1)?
+                | bit(c, 2)?
+                | bit(d, 3)?
+                | bit(e, 4)?
+                | bit(f, 5)?
+                | bit(g, 6)?
+                | bit(h, 7)?,
+        )
+    }
 }

 impl IsleContext<'_, '_, MInst, X64Backend> {