From 5ebe53a3516b9031aa58a30d99118451dbc01cd3 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 17 Mar 2023 11:18:41 -0500 Subject: [PATCH] x64: Elide more uextend with extractlane (#6045) * x64: Elide more uextend with extractlane I've confirmed locally now that `pextr{b,w,d}` all zero the upper bits of the full 64-bit register size which means that the `extractlane` operation with a zero-extend can be elided for more cases, including 8-to-64-bit casts as well as 32-to-64. This helps elide a few extra `mov`s in a loop I was looking at and had a modest corresponding increase in performance (my guess was due to the slightly decreased code size mostly as opposed to the removed `mov`s). * Remove stray file --- cranelift/codegen/src/isa/x64/inst.isle | 15 ++-- cranelift/codegen/src/isa/x64/lower.isle | 2 + .../filetests/isa/x64/uextend-elision.clif | 78 +++++++++++++++++++ 3 files changed, 86 insertions(+), 9 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index fbaac1da34..e51c9e8913 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1715,14 +1715,11 @@ (if-let $true (value32_zeros_upper32 src)) src) -;; The `extractlane` instruction, extended to `$I32`, means that either an -;; i8x16 or an i16x8 is being extracted. These are implemented with -;; the `pextr{b,w}` instruction which automatically zero the upper bits of the -;; destination register so the `uextend` in these cases can be elided. -;; -;; TODO: the documentation for `pextr{b,w}` seems to indicate it zero extends -;; to not only 32-bits but probably the whole 64-bit register. If that's the -;; case then this should match a zero-extend to any size instead of just `$I32`. +;; The `extractlane` instruction, when paired with a zero-extension, means +;; that one of `i{8x16,16x8,32x4}` is being extracted and extend. These +;; extractions are implemented with the `pextr{b,w,d}` instructions which +;; automatically zero the upper bits of the destination register so the +;; `uextend` in these cases can be elided. ;; ;; TODO: the interaction here between this rule and the "it's written far away" ;; rule to lower `extractlane` isn't great. Ideally this rule (and the other @@ -1730,7 +1727,7 @@ ;; be connected to the extractlane rules. There's some discussion of this on ;; #6022 but the gist is that there's not a lot of great options at this time, ;; so this doc block is what's here for now. -(rule 1 (extend_to_gpr src @ (extractlane _ _) $I32 (ExtendKind.Zero)) +(rule 1 (extend_to_gpr src @ (extractlane _ _) _ (ExtendKind.Zero)) src) (rule (extend_to_gpr (and val (value_type from_ty)) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index f42270f120..98f1b13750 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3808,6 +3808,8 @@ (rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane))) (x64_pextrw val lane)) +;; See the note in the 8x16 case above for how this rule is connected to +;; `extend_to_gpr`. (rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 32 4)) (u8_from_uimm8 lane))) (x64_pextrd val lane)) diff --git a/cranelift/filetests/filetests/isa/x64/uextend-elision.clif b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif index cc277ff602..76689c7686 100644 --- a/cranelift/filetests/filetests/isa/x64/uextend-elision.clif +++ b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif @@ -79,6 +79,32 @@ block0(v0: i8x16): ; popq %rbp ; retq +function %extractlane_i8x16_i64(i8x16) -> i64 { +block0(v0: i8x16): + v1 = extractlane v0, 1 + v2 = uextend.i64 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrb $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrb $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + function %extractlane_i16x8_i32(i16x8) -> i32 { block0(v0: i16x8): v1 = extractlane v0, 1 @@ -105,3 +131,55 @@ block0(v0: i16x8): ; popq %rbp ; retq +function %extractlane_i16x8_i64(i16x8) -> i64 { +block0(v0: i16x8): + v1 = extractlane v0, 1 + v2 = uextend.i64 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrw $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrw $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extractlane_i32x4_i64(i32x4) -> i64 { +block0(v0: i32x4): + v1 = extractlane v0, 1 + v2 = uextend.i64 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrd $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrd $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq +