diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 69b926f826..fbaac1da34 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1703,11 +1703,35 @@ (decl extend_to_gpr (Value Type ExtendKind) Gpr) ;; If the value is already of the requested type, no extending is necessary. +(rule 3 (extend_to_gpr val @ (value_type ty) ty _kind) + val) + +;; I32 -> I64 with op that produces a zero-extended value in a register. ;; -;; Priority 1 because the equality constraint doesn't prove that this rule -;; doesn't overlap with the one below. -(rule 1 (extend_to_gpr (and val (value_type ty)) ty _kind) - (put_in_gpr val)) +;; As a particular x64 extra-pattern matching opportunity, all the ALU +;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can +;; even not generate a zero-extended move in this case. +(rule 2 (extend_to_gpr src @ (value_type $I32) $I64 (ExtendKind.Zero)) + (if-let $true (value32_zeros_upper32 src)) + src) + +;; The `extractlane` instruction, extended to `$I32`, means that either an +;; i8x16 or an i16x8 is being extracted. These are implemented with +;; the `pextr{b,w}` instructions, which automatically zero the upper bits of the +;; destination register so the `uextend` in these cases can be elided. +;; +;; TODO: the documentation for `pextr{b,w}` seems to indicate it zero extends +;; to not only 32-bits but probably the whole 64-bit register. If that's the +;; case then this should match a zero-extend to any size instead of just `$I32`. +;; +;; TODO: the interaction here between this rule and the "it's written far away" +;; rule to lower `extractlane` isn't great. Ideally this rule (and the other +;; special cases for `value32_zeros_upper32`) would live contextually closer or +;; be connected to the extractlane rules. 
There's some discussion of this on +;; #6022 but the gist is that there's not a lot of great options at this time, +;; so this doc block is what's here for now. +(rule 1 (extend_to_gpr src @ (extractlane _ _) $I32 (ExtendKind.Zero)) + src) (rule (extend_to_gpr (and val (value_type from_ty)) to_ty @@ -1732,6 +1756,21 @@ (rule (extend (ExtendKind.Sign) ty mode src) (x64_movsx mode src)) +;; Tests whether the operation used to produce the input `Value`, which must +;; be a 32-bit operation, will automatically zero the upper 32-bits of the +;; destination register that `Value` is placed in. +(decl pure value32_zeros_upper32 (Value) bool) +(rule (value32_zeros_upper32 (iadd _ _)) $true) +(rule (value32_zeros_upper32 (isub _ _)) $true) +(rule (value32_zeros_upper32 (imul _ _)) $true) +(rule (value32_zeros_upper32 (band _ _)) $true) +(rule (value32_zeros_upper32 (bor _ _)) $true) +(rule (value32_zeros_upper32 (bxor _ _)) $true) +(rule (value32_zeros_upper32 (ishl _ _)) $true) +(rule (value32_zeros_upper32 (ushr _ _)) $true) +(rule (value32_zeros_upper32 (uload32 _ _ _)) $true) +(rule -1 (value32_zeros_upper32 _) $false) + ;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Turn a vector type into its integer-typed vector equivalent. diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 99adfb7cfd..f42270f120 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2039,101 +2039,38 @@ ;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; T -> T is a no-op. -(rule 1 (lower (has_type ty (uextend src @ (value_type ty)))) - src) - -;; I64 -> I128. -(rule -1 (lower (has_type $I128 (uextend src @ (value_type $I64)))) - (value_regs src (imm $I64 0))) - -;; I{8,16,32} -> I128. -(rule (lower (has_type $I128 (uextend src @ (value_type (fits_in_32 src_ty))))) +;; I{8,16,32,64} -> I128. 
+(rule (lower (has_type $I128 (uextend src))) (value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0))) ;; I{8,16,32} -> I64. -(rule -1 (lower (has_type $I64 (uextend src @ (value_type (fits_in_32 src_ty))))) +(rule (lower (has_type $I64 (uextend src))) (extend_to_gpr src $I64 (ExtendKind.Zero))) -;; I8 -> I{16,32}, I16 -> I32. -(rule -2 (lower (has_type (fits_in_32 dst_ty) (uextend src @ (value_type (fits_in_32 src_ty))))) - (extend_to_gpr src $I32 (ExtendKind.Zero))) - -;; I32 -> I64 with op that produces a zero-extended value in a register. -;; -;; As a particular x64 extra-pattern matching opportunity, all the ALU -;; opcodes on 32-bits will zero-extend the upper 32-bits, so we can -;; even not generate a zero-extended move in this case. -;; -;; (Note that we unfortunately can't factor out the -;; insts-that-zero-upper-32 pattern into a separate extractor until we -;; can write internal extractors with multiple rules; and we'd rather -;; keep these here than write an external extractor containing bits of -;; the instruction pattern.s) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (iadd _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (isub _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (imul _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (band _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (bor _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (bxor _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (ishl _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (ushr _ _))))) - src) -(rule (lower (has_type $I64 - (uextend src @ (has_type $I32 (uload32 _ _ _))))) - src) +;; I{8,16} -> I32 +;; I8 -> I16 +(rule -1 (lower (has_type (fits_in_32 _) (uextend src))) + (extend_to_gpr src $I32 (ExtendKind.Zero))) ;; Rules for `sextend` 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(decl generic_sextend (Value Type Type) InstOutput) - -;; T -> T is a no-op. -(rule 4 (generic_sextend src ty ty) - src) - +;; I{8,16,32,64} -> I128. +;; ;; Produce upper 64 bits sign-extended from lower 64: shift right by ;; 63 bits to spread the sign bit across the result. -(decl spread_sign_bit (Gpr) Gpr) -(rule (spread_sign_bit src) - (x64_sar $I64 src (Imm8Reg.Imm8 63))) - -;; I64 -> I128. -(rule 3 (generic_sextend src $I64 $I128) - (value_regs src (spread_sign_bit src))) - -;; I{8,16,32} -> I128. -(rule 2 (generic_sextend src (fits_in_32 src_ty) $I128) +(rule (lower (has_type $I128 (sextend src))) (let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign))) - (hi Gpr (spread_sign_bit lo))) + (hi Gpr (x64_sar $I64 lo (Imm8Reg.Imm8 63)))) (value_regs lo hi))) ;; I{8,16,32} -> I64. -(rule 1 (generic_sextend src (fits_in_32 src_ty) $I64) +(rule (lower (has_type $I64 (sextend src))) (extend_to_gpr src $I64 (ExtendKind.Sign))) -;; I8 -> I{16,32}, I16 -> I32. -(rule 0 (generic_sextend src (fits_in_32 src_ty) (fits_in_32 dst_ty)) - (extend_to_gpr src $I32 (ExtendKind.Sign))) - -(rule (lower - (has_type dst_ty - (sextend src @ (value_type src_ty)))) - (generic_sextend src src_ty dst_ty)) +;; I{8,16} -> I32 +;; I8 -> I16 +(rule -1 (lower (has_type (fits_in_32 _) (sextend src))) + (extend_to_gpr src $I32 (ExtendKind.Sign))) ;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3859,9 +3796,15 @@ ;; 0xee == 0b11_10_11_10 (x64_pshufd val 0xee)) +;; Note that the `pextrb` lowering here is relied upon by the `extend_to_gpr` +;; helper because it will elide a `uextend` operation when `extractlane` is the +;; inner node. The `pextrb` operation automatically zero-extends for us so the +;; extra `uextend` doesn't have to codegen anything. 
(rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 8 16)) (u8_from_uimm8 lane))) (x64_pextrb val lane)) +;; See the note in the 8x16 case above for how this rule is connected to +;; `extend_to_gpr`. (rule 0 (lower (extractlane val @ (value_type ty @ (multi_lane 16 8)) (u8_from_uimm8 lane))) (x64_pextrw val lane)) diff --git a/cranelift/filetests/filetests/isa/x64/uextend-elision.clif b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif index d4e306f50f..cc277ff602 100644 --- a/cranelift/filetests/filetests/isa/x64/uextend-elision.clif +++ b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif @@ -27,3 +27,81 @@ block0(v0: i32, v1: i32): ; popq %rbp ; retq +function %extractlane_i8x16_i16(i8x16) -> i16 { +block0(v0: i8x16): + v1 = extractlane v0, 1 + v2 = uextend.i16 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrb $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrb $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extractlane_i8x16_i32(i8x16) -> i32 { +block0(v0: i8x16): + v1 = extractlane v0, 1 + v2 = uextend.i32 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrb $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrb $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %extractlane_i16x8_i32(i16x8) -> i32 { +block0(v0: i16x8): + v1 = extractlane v0, 1 + v2 = uextend.i32 v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pextrw $1, %xmm0, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pextrw $1, %xmm0, %eax +; movq %rbp, %rsp +; popq %rbp +; retq +