From f05babc74414b59a8eb2d92e13ce4708c7482711 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 1 Mar 2023 15:49:00 -0600 Subject: [PATCH] x64: Add `shuffle` cases for `punpck{h,l}bw` (#5905) * x64: Add `shuffle` cases for `punpck{h,l}bw` I noticed this difference between LLVM and Cranelift for something I was looking at recently, and while it's probably not all that common I figured I'd add it here since it should be somewhat useful nevertheless. * Review feedback * Use u128 extractor instead --- cranelift/codegen/src/isa/x64/lower.isle | 10 ++++ .../filetests/filetests/isa/x64/shuffle.clif | 54 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 cranelift/filetests/filetests/isa/x64/shuffle.clif diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 38c1f2de79..9f83ccbfb9 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3510,6 +3510,16 @@ ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Special case for the `punpckhbw` instruction which interleaves the upper +;; lanes of the two input registers. +(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) + (x64_punpckhbw a b)) + +;; Special case for the `punpcklbw` instruction which interleaves the lower +;; lanes of the two input registers. +(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) + (x64_punpcklbw a b)) + ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM ;; register. We statically build `constructed_mask` to zero out any unknown lane ;; indices (may not be completely necessary: verification could fail incorrect diff --git a/cranelift/filetests/filetests/isa/x64/shuffle.clif b/cranelift/filetests/filetests/isa/x64/shuffle.clif new file mode 100644 index 0000000000..529b95cc5d --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shuffle.clif @@ -0,0 +1,54 @@ +test compile precise-output +set enable_simd +target x86_64 + +function %punpcklbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, 0x17071606150514041303120211011000 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpcklbw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpcklbw %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %punpckhbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, 0x1f0f1e0e1d0d1c0c1b0b1a0a19091808 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; punpckhbw %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; punpckhbw %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +