x64: Add support for the pblendw instruction (#6023)

This commit adds another case for `shuffle` lowering to the x64 backend for the `{,v}pblendw` instruction. This instruction selects each 16-bit lane from one of its two inputs according to an 8-bit immediate mask, where each bit chooses the source of the corresponding lane.
2023-03-15 12:20:43 -05:00
parent fcddb9ca81
commit 6ed90f86c8
8 changed files with 132 additions and 14 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -918,6 +918,7 @@
            Punpcklqdq
            Pshuflw
            Pshufhw
+            Pblendw
          ))

 (type CmpOpcode extern
@@ -1290,6 +1291,7 @@
            Vpextrw
            Vpextrd
            Vpextrq
+            Vpblendw
          ))

 (type Avx512Opcode extern
@@ -2967,6 +2969,14 @@
      (if-let $true (has_avx))
      (xmm_rmr_blend_vex (AvxOpcode.Vpblendvb) src1 src2 mask))

+;; Helper for creating `pblendw` (or, with AVX, `vpblendw`) instructions.
+(decl x64_pblendw (Xmm XmmMem u8) Xmm)
+(rule 0 (x64_pblendw src1 src2 imm)
+      (xmm_rm_r_imm (SseOpcode.Pblendw) src1 src2 imm (OperandSize.Size32)))
+(rule 1 (x64_pblendw src1 src2 imm)
+      (if-let $true (has_avx))
+      (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm))
+
 ;; Helper for creating a `movsd` instruction which creates a new vector
 ;; register where the upper 64-bits are from the first operand and the low
 ;; 64-bits are from the second operand.
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1125,6 +1125,7 @@ pub enum SseOpcode {
    Punpcklqdq,
    Pshuflw,
    Pshufhw,
+    Pblendw,
 }

 impl SseOpcode {
@@ -1318,7 +1319,8 @@ impl SseOpcode {
            | SseOpcode::Roundps
            | SseOpcode::Roundpd
            | SseOpcode::Roundss
-            | SseOpcode::Roundsd => SSE41,
+            | SseOpcode::Roundsd
+            | SseOpcode::Pblendw => SSE41,

            SseOpcode::Pcmpgtq => SSE42,
        }
@@ -1521,6 +1523,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Punpckhqdq => "punpckhqdq",
            SseOpcode::Pshuflw => "pshuflw",
            SseOpcode::Pshufhw => "pshufhw",
+            SseOpcode::Pblendw => "pblendw",
        };
        write!(fmt, "{}", name)
    }
@@ -1705,7 +1708,8 @@ impl AvxOpcode {
            | AvxOpcode::Vpextrb
            | AvxOpcode::Vpextrw
            | AvxOpcode::Vpextrd
-            | AvxOpcode::Vpextrq => {
+            | AvxOpcode::Vpextrq
+            | AvxOpcode::Vpblendw => {
                smallvec![InstructionSet::AVX]
            }
        }
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2263,6 +2263,7 @@ pub(crate) fn emit(
                AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
                AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
                AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
+                AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E),
                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
            };

@@ -2719,6 +2720,7 @@ pub(crate) fn emit(
                SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
                SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
                SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
+                SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3),
                _ => unimplemented!("Opcode {:?} not implemented", op),
            };
            let rex = RexFlags::from(*size);
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3704,6 +3704,15 @@

 ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+;; Special case for `pblendw` which takes an 8-bit immediate where each bit
+;; indicates which lane of the two operands is chosen for the output. A bit of
+;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the
+;; corresponding 16-bit lane from `b`.
+(rule 14 (lower (shuffle a b (pblendw_imm n)))
+         (x64_pblendw a b n))
+(decl pblendw_imm (u8) Immediate)
+(extern extractor pblendw_imm pblendw_imm)
+
 ;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
 ;; bytes", that's a `palignr` instruction. Note that the order of operands are
 ;; swapped in the instruction here. The `palignr` instruction uses the second
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -980,6 +980,41 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
            None
        }
    }
+
+    /// Extractor behind the ISLE `pblendw_imm` term.
+    ///
+    /// Succeeds when the shuffle immediate describes a lane-wise blend of
+    /// 16-bit lanes, yielding the 8-bit mask for `pblendw`: bit N is 0 when
+    /// the selector for lane N is below 8 and 1 otherwise.
+    ///
+    /// Returns `None` if the immediate does not select 16-bit lanes or if
+    /// any selector refers to a lane other than its own position.
+    fn pblendw_imm(&mut self, imm: Immediate) -> Option<u8> {
+        // `pblendw` operates on whole 16-bit lanes, so bail out unless the
+        // shuffle immediate can be viewed as eight 16-bit lane selectors.
+        let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
+        let selectors = [a, b, c, d, e, f, g, h];
+
+        // Fold the selectors into the instruction's 8-bit mask, one bit per
+        // output lane, walking the lanes in order from 0 to 7. The fold
+        // stops with `None` at the first selector that cannot be encoded.
+        selectors
+            .iter()
+            .zip(0u8..)
+            .try_fold(0u8, |mask, (&selector, lane)| {
+                // The instruction never moves data across lanes: output
+                // lane N can only come from lane N of one of the two
+                // operands, which requires `selector % 8 == N`.
+                if selector % 8 != lane {
+                    return None;
+                }
+
+                // Selectors below 8 leave this lane's mask bit clear;
+                // all remaining selectors set it.
+                let lane_bit = if selector < 8 { 0 } else { 1u8 << lane };
+                Some(mask | lane_bit)
+            })
+    }
 }

 impl IsleContext<'_, '_, MInst, X64Backend> {
--- a/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle-avx.clif
@@ -114,3 +114,31 @@ block0(v0: i64x2, v1: i64x2):
 ;   popq %rbp
 ;   retq

+function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vpblendw $153, %xmm0, %xmm1, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vpblendw $0x99, %xmm1, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
--- a/cranelift/filetests/filetests/isa/x64/shuffle.clif
+++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif
@@ -654,9 +654,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm4
-;   movdqa  %xmm1, %xmm0
-;   palignr $0, %xmm0, %xmm4, %xmm0
+;   pblendw $0, %xmm0, %xmm1, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -666,9 +664,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movdqa %xmm0, %xmm4
-;   movdqa %xmm1, %xmm0
-;   palignr $0, %xmm4, %xmm0
+;   pblendw $0, %xmm1, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
@@ -770,9 +766,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm4
-;   movdqa  %xmm1, %xmm0
-;   palignr $16, %xmm0, %xmm4, %xmm0
+;   pblendw $255, %xmm0, %xmm1, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -782,9 +776,35 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movdqa %xmm0, %xmm4
-;   movdqa %xmm1, %xmm0
-;   palignr $0x10, %xmm4, %xmm0
+;   pblendw $0xff, %xmm1, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   pblendw $153, %xmm0, %xmm1, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   pblendw $0x99, %xmm1, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
--- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif
+++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif
@@ -553,3 +553,13 @@ block0(v0: i64x2, v1: i64x2):
    return v5
 }
 ; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605]
+
+function %pblendw_0b10011001(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = bitcast.i8x16 little v0
+    v3 = bitcast.i8x16 little v1
+    v4 = shuffle v2, v3, [16 17 2 3 4 5 22 23 24 25 10 11 12 13 30 31]
+    v5 = bitcast.i16x8 little v4
+    return v5
+}
+; run: %pblendw_0b10011001([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [9 2 3 12 13 6 7 16]