From cae3b26623e98044c4cc96a18ef2bd50edf01da4 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 16 Feb 2023 14:47:59 -0600 Subject: [PATCH] x64: Improve codegen for vectors with constant shift amounts (#5797) I stumbled across this working on #5795 and figured this was a nice opportunity to improve the codegen here. --- cranelift/codegen/src/isa/x64/inst.isle | 4 + cranelift/codegen/src/isa/x64/lower.isle | 36 +- cranelift/codegen/src/isa/x64/lower/isle.rs | 4 + .../isa/x64/simd-bitwise-compile.clif | 363 +++++++++++++++--- 4 files changed, 344 insertions(+), 63 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index cfc2afc618..715d9fd879 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1455,6 +1455,10 @@ (decl shift_mask (Type) u32) (extern constructor shift_mask shift_mask) +;; Mask a constant with the type's shift mask +(decl shift_amount_masked (Type Imm64) u32) +(extern constructor shift_amount_masked shift_amount_masked) + ;; Extract a constant `GprMemImm.Imm` from a value operand. (decl simm32_from_value (GprMemImm) Value) (extern extractor simm32_from_value simm32_from_value) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index e8fd01f840..1275ab2ad2 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -473,7 +473,7 @@ (rule (lower (has_type ty @ $I8X16 (ishl src amt))) (let ( ;; Mask the amount to ensure wrapping behaviour - (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) + (masked_amt RegMemImm (mask_xmm_shift ty amt)) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. @@ -515,16 +515,13 @@ ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. (rule (lower (has_type ty @ $I16X8 (ishl src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psllw src (mov_rmi_to_xmm masked_amt)))) + (x64_psllw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I32X4 (ishl src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_pslld src (mov_rmi_to_xmm masked_amt)))) + (x64_pslld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I64X2 (ishl src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psllq src (mov_rmi_to_xmm masked_amt)))) + (x64_psllq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -580,7 +577,7 @@ (rule (lower (has_type ty @ $I8X16 (ushr src amt))) (let ( ;; Mask the amount to ensure wrapping behaviour - (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) + (masked_amt RegMemImm (mask_xmm_shift ty amt)) ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be ;; correct for half of the lanes; the others must be fixed up with ;; the mask below. @@ -625,16 +622,19 @@ ;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. 
(rule (lower (has_type ty @ $I16X8 (ushr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrlw src (mov_rmi_to_xmm masked_amt)))) + (x64_psrlw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I32X4 (ushr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrld src (mov_rmi_to_xmm masked_amt)))) + (x64_psrld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I64X2 (ushr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrlq src (mov_rmi_to_xmm masked_amt)))) + (x64_psrlq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) + +(decl mask_xmm_shift (Type Value) RegMemImm) +(rule (mask_xmm_shift ty amt) + (gpr_to_reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) +(rule 1 (mask_xmm_shift ty (iconst n)) + (RegMemImm.Imm (shift_amount_masked ty n))) ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -701,7 +701,7 @@ (rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty)))) (let ((src_ Xmm (put_in_xmm src)) ;; Mask the amount to ensure wrapping behaviour - (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))) + (masked_amt RegMemImm (mask_xmm_shift ty amt)) ;; In order for `packsswb` later to only use the high byte of each ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to ;; fill in the upper bits appropriately. @@ -728,12 +728,10 @@ ;; that if the shift amount is in a register, it is in an XMM register. (rule (lower (has_type ty @ $I16X8 (sshr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psraw src (mov_rmi_to_xmm masked_amt)))) + (x64_psraw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) (rule (lower (has_type ty @ $I32X4 (sshr src amt))) - (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) - (x64_psrad src (mov_rmi_to_xmm masked_amt)))) + (x64_psrad src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older ;; feature sets. 
Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 7d97d761ed..26766dc3d6 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -259,6 +259,10 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { ty.lane_bits() - 1 } + fn shift_amount_masked(&mut self, ty: Type, val: Imm64) -> u32 { + (val.bits() as u32) & self.shift_mask(ty) + } + #[inline] fn simm32_from_value(&mut self, val: Value) -> Option { let inst = self.lower_ctx.dfg().value_def(val).inst()?; diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index 056f256013..2c56dfd3c8 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -360,6 +360,117 @@ block0(v0: i32): ; addb %al, (%rax) ; addb %al, (%rax) +function %ishl_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 124 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psllw %xmm0, $4, %xmm0 +; movdqu const(0), %xmm4 +; pand %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psllw $4, %xmm0 +; movdqu 0xf(%rip), %xmm4 +; pand %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) + +function %ishl_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psllw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psllw $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ishl_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 100 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pslld %xmm0, $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pslld $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ishl_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 100 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psllq %xmm0, $36, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psllq $0x24, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + function %ushr_i8x16_imm() -> i8x16 { block0: v0 = iconst.i32 1 @@ -373,14 +484,9 @@ block0: ; movq %rsp, %rbp ; block0: ; movdqu const(1), %xmm0 -; movl $1, %r9d -; andq %r9, $7, %r9 -; movd %r9d, %xmm5 -; psrlw %xmm0, %xmm5, %xmm0 -; lea const(0), %rsi -; shlq $4, %r9, %r9 -; movdqu 0(%rsi,%r9,1), %xmm13 -; pand %xmm0, %xmm13, %xmm0 +; psrlw %xmm0, $1, %xmm0 +; movdqu const(0), %xmm3 +; pand %xmm0, %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -390,21 +496,109 @@ block0: ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqu 0xb4(%rip), %xmm0 -; movl $1, %r9d -; andq $7, %r9 -; movd %r9d, %xmm5 -; psrlw %xmm5, %xmm0 -; leaq 0x1a(%rip), %rsi -; shlq $4, %r9 -; movdqu (%rsi, %r9), %xmm13 -; pand %xmm13, %xmm0 +; 
movdqu 0x34(%rip), %xmm0 +; psrlw $1, %xmm0 +; movdqu 0x17(%rip), %xmm3 +; pand %xmm3, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq ; addb %al, (%rax) ; addb %al, (%rax) -; addb %bh, %bh +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; jg 0xb1 +; jg 0xb3 +; jg 0xb5 +; jg 0xb7 +; jg 0xb9 +; jg 0xbb +; jg 0xbd +; jg 0xbf +; addb %al, (%rcx) +; addb (%rbx), %al +; addb $5, %al + +function %ushr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrlw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrlw $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ushr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 100 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrld %xmm0, $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrld $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %ushr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 100 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrlq %xmm0, $36, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrlq $0x24, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq function %sshr_i8x16(i32) -> i8x16 { block0(v0: i32): @@ -465,19 +659,15 @@ block0(v0: i8x16, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movl $3, %r10d -; andq %r10, $7, %r10 -; movdqa %xmm0, %xmm13 -; punpcklbw %xmm13, %xmm0, %xmm13 -; movdqa %xmm13, %xmm12 -; movdqa %xmm0, %xmm13 -; punpckhbw %xmm13, %xmm0, %xmm13 -; addl %r10d, $8, %r10d -; movd %r10d, %xmm14 -; movdqa %xmm12, %xmm0 -; psraw %xmm0, %xmm14, %xmm0 -; psraw %xmm13, %xmm14, %xmm13 -; packsswb %xmm0, %xmm13, %xmm0 +; movdqa %xmm0, %xmm7 +; punpcklbw %xmm7, %xmm0, %xmm7 +; movdqa %xmm7, %xmm8 +; movdqa %xmm0, %xmm7 +; punpckhbw %xmm7, %xmm0, %xmm7 +; movdqa %xmm8, %xmm0 +; psraw %xmm0, $11, %xmm0 +; psraw %xmm7, $11, %xmm7 +; packsswb %xmm0, %xmm7, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -487,19 +677,104 @@ block0(v0: i8x16, v1: i32): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movl $3, %r10d -; andq $7, %r10 -; movdqa %xmm0, %xmm13 -; punpcklbw %xmm0, %xmm13 -; movdqa %xmm13, %xmm12 -; movdqa %xmm0, %xmm13 -; punpckhbw %xmm0, %xmm13 -; addl $8, %r10d -; movd %r10d, %xmm14 -; movdqa %xmm12, %xmm0 -; psraw %xmm14, %xmm0 -; psraw %xmm14, %xmm13 -; packsswb %xmm13, %xmm0 +; movdqa %xmm0, %xmm7 +; punpcklbw %xmm0, %xmm7 +; movdqa %xmm7, %xmm8 +; movdqa %xmm0, %xmm7 +; punpckhbw %xmm0, %xmm7 +; movdqa %xmm8, %xmm0 +; psraw $0xb, %xmm0 +; psraw $0xb, %xmm7 +; packsswb %xmm7, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sshr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 1 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psraw %xmm0, $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psraw $1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sshr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 
100 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; psrad %xmm0, $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; psrad $4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sshr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 100 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrd.w $0, %xmm0, %rdx +; pextrd.w $1, %xmm0, %r9 +; sarq $36, %rdx, %rdx +; sarq $36, %r9, %r9 +; uninit %xmm0 +; pinsrd.w $0, %xmm0, %rdx, %xmm0 +; pinsrd.w $1, %xmm0, %r9, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrq $0, %xmm0, %rdx +; pextrq $1, %xmm0, %r9 +; sarq $0x24, %rdx +; sarq $0x24, %r9 +; pinsrq $0, %rdx, %xmm0 +; pinsrq $1, %r9, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq
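
Note on the lowering change above: the new `mask_xmm_shift` helper has two rules. When the shift amount is an `iconst`, the higher-priority rule folds it into a `RegMemImm.Imm` by masking it with the lane width at compile time (the `shift_amount_masked` callback computes `val & (lane_bits - 1)`), which is why the filetests show `ishl.i8x16` by 124 lowering to `psllw $4` plus the byte-lane fixup mask, and `ushr.i64x2` by 100 lowering to `psrlq $36`. For non-constant amounts the fallback rule keeps the old behaviour: mask with `and` in a GPR and move the result into an XMM register.

A minimal standalone sketch of that compile-time masking, in plain Rust and independent of the Cranelift types (`lane_bits` here stands in for `Type::lane_bits()`, and the assertions mirror the filetest expectations above):

// Sketch of what `shift_amount_masked` does: a vector shift by `amt`
// only uses `amt % lane_bits`, so a constant amount can be reduced up
// front and emitted in the instruction's immediate field.
fn shift_amount_masked(lane_bits: u32, amt: u64) -> u32 {
    // `lane_bits` is a power of two (8, 16, 32, 64), so `lane_bits - 1`
    // is the same mask the register path applies with `and`.
    (amt as u32) & (lane_bits - 1)
}

fn main() {
    // i8x16 shifted by 124 -> immediate 4 (psllw $4 + mask)
    assert_eq!(shift_amount_masked(8, 124), 4);
    // i32x4 shifted by 100 -> immediate 4 (pslld/psrld/psrad $4)
    assert_eq!(shift_amount_masked(32, 100), 4);
    // i64x2 shifted by 100 -> immediate 36 (psllq/psrlq $36, sarq $0x24)
    assert_eq!(shift_amount_masked(64, 100), 36);
    println!("constant vector shift amounts fold to immediates");
}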