x64: Share a zero in the ushr translation on x64 to free up a register (#5424)

Share a zero value in the translation of ushr for i128. This extends the value's live range by a few instructions, but reduces the number of registers used in the translation by one, which seems like an acceptable trade-off.
Trevor Elliott, 2022-12-12 18:15:43 -08:00 (committed by GitHub)
parent 9397ea1abe
commit a5ecb5e647
3 changed files with 77 additions and 85 deletions

@@ -521,9 +521,12 @@
               (x64_sub $I64
                 (imm $I64 64)
                 amt)))
+
+      ;; Share the zero value to reduce register pressure
+      (zero Gpr (imm $I64 0))
       ;; Nullify the carry if we are shifting by a multiple of 128.
       (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) (RegMemImm.Imm 127) amt)
-                    (cmove $I64 (CC.Z) (imm $I64 0) carry)))
+                    (cmove $I64 (CC.Z) zero carry)))
       ;; Add the carry bits into the lo.
       (lo_shifted_ Gpr (x64_or $I64 carry_ lo_shifted)))
       ;; Combine the two shifted halves. However, if we are shifting by >= 64
@@ -532,7 +535,7 @@
       (with_flags (x64_test (OperandSize.Size64) (RegMemImm.Imm 64) amt)
         (consumes_flags_concat
           (cmove $I64 (CC.Z) lo_shifted_ hi_shifted)
-          (cmove $I64 (CC.Z) hi_shifted (imm $I64 0))))))
+          (cmove $I64 (CC.Z) hi_shifted zero)))))

 (rule (lower (has_type $I128 (ushr src amt)))
   ;; NB: Only the low bits of `amt` matter since we logically mask the shift

@@ -923,16 +923,15 @@ block0(v0: i128, v1: i128):
 ; movq %rsi, %r10
 ; shrq %cl, %r10, %r10
 ; movl $64, %ecx
-; movq %rdx, %rax
-; subq %rcx, %rax, %rcx
+; movq %rdx, %rdi
+; subq %rcx, %rdi, %rcx
 ; movq %rsi, %r11
 ; shlq %cl, %r11, %r11
-; xorq %r9, %r9, %r9
-; testq $127, %rax
-; cmovzq %r9, %r11, %r11
-; orq %r11, %r8, %r11
 ; xorq %rdx, %rdx, %rdx
-; testq $64, %rax
+; testq $127, %rdi
+; cmovzq %rdx, %r11, %r11
+; orq %r11, %r8, %r11
+; testq $64, %rdi
 ; movq %r10, %rax
 ; cmovzq %r11, %rax, %rax
 ; cmovzq %r10, %rdx, %rdx
@@ -1000,29 +999,28 @@ block0(v0: i128, v1: i128):
 ; cmovzq %rdx, %rax, %rax
 ; cmovzq %r10, %rdx, %rdx
 ; movl $128, %ecx
-; movq %r8, %r11
-; subq %rcx, %r11, %rcx
+; movq %r8, %r10
+; subq %rcx, %r10, %rcx
 ; movq %rdi, %r8
 ; shrq %cl, %r8, %r8
 ; movq %rsi, %r9
 ; shrq %cl, %r9, %r9
 ; movq %rcx, %r10
 ; movl $64, %ecx
-; movq %r10, %rdi
-; subq %rcx, %rdi, %rcx
+; movq %r10, %r11
+; subq %rcx, %r11, %rcx
 ; movq %rsi, %r10
 ; shlq %cl, %r10, %r10
-; xorq %r11, %r11, %r11
-; testq $127, %rdi
-; cmovzq %r11, %r10, %r10
+; xorq %rsi, %rsi, %rsi
+; testq $127, %r11
+; cmovzq %rsi, %r10, %r10
 ; orq %r10, %r8, %r10
-; xorq %r8, %r8, %r8
-; testq $64, %rdi
-; movq %r9, %r11
-; cmovzq %r10, %r11, %r11
-; cmovzq %r9, %r8, %r8
-; orq %rax, %r11, %rax
-; orq %rdx, %r8, %rdx
+; testq $64, %r11
+; movq %r9, %r8
+; cmovzq %r10, %r8, %r8
+; cmovzq %r9, %rsi, %rsi
+; orq %rax, %r8, %rax
+; orq %rdx, %rsi, %rdx
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -1047,36 +1045,36 @@ block0(v0: i128, v1: i128):
 ; subq %rcx, %rax, %rcx
 ; movq %rsi, %r11
 ; shlq %cl, %r11, %r11
-; xorq %r9, %r9, %r9
-; testq $127, %rax
-; cmovzq %r9, %r11, %r11
-; orq %r11, %r8, %r11
 ; xorq %rdx, %rdx, %rdx
+; testq $127, %rax
+; cmovzq %rdx, %r11, %r11
+; orq %r11, %r8, %r11
 ; testq $64, %rax
-; movq %rax, %r9
 ; movq %r10, %rax
 ; cmovzq %r11, %rax, %rax
 ; cmovzq %r10, %rdx, %rdx
 ; movl $128, %ecx
-; movq %r9, %r8
-; subq %rcx, %r8, %rcx
-; movq %rdi, %r9
-; shlq %cl, %r9, %r9
-; shlq %cl, %rsi, %rsi
-; movq %rcx, %r10
-; movl $64, %ecx
+; movq %r9, %r10
 ; subq %rcx, %r10, %rcx
-; movq %rdi, %r11
-; shrq %cl, %r11, %r11
-; xorq %rdi, %rdi, %rdi
-; testq $127, %r10
-; cmovzq %rdi, %r11, %r11
-; orq %r11, %rsi, %r11
-; testq $64, %r10
-; cmovzq %r9, %rdi, %rdi
+; movq %rdi, %r8
+; shlq %cl, %r8, %r8
+; movq %rsi, %r10
+; shlq %cl, %r10, %r10
+; movq %rcx, %r9
+; movl $64, %ecx
+; movq %r9, %rsi
+; subq %rcx, %rsi, %rcx
+; movq %rdi, %r9
+; shrq %cl, %r9, %r9
+; xorq %r11, %r11, %r11
+; testq $127, %rsi
 ; cmovzq %r11, %r9, %r9
-; orq %rax, %rdi, %rax
-; orq %rdx, %r9, %rdx
+; orq %r9, %r10, %r9
+; testq $64, %rsi
+; cmovzq %r8, %r11, %r11
+; cmovzq %r9, %r8, %r8
+; orq %rax, %r11, %rax
+; orq %rdx, %r8, %rdx
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret

@@ -20,18 +20,17 @@ block0(v0: i128, v1: i8):
 ; shrq %cl, %r8, %r8
 ; movq %rsi, %r10
 ; shrq %cl, %r10, %r10
-; movq %rcx, %r11
+; movq %rcx, %r9
 ; movl $64, %ecx
-; movq %r11, %rax
-; subq %rcx, %rax, %rcx
+; movq %r9, %rdi
+; subq %rcx, %rdi, %rcx
 ; movq %rsi, %r11
 ; shlq %cl, %r11, %r11
-; xorq %r9, %r9, %r9
-; testq $127, %rax
-; cmovzq %r9, %r11, %r11
-; orq %r11, %r8, %r11
 ; xorq %rdx, %rdx, %rdx
-; testq $64, %rax
+; testq $127, %rdi
+; cmovzq %rdx, %r11, %r11
+; orq %r11, %r8, %r11
+; testq $64, %rdi
 ; movq %r10, %rax
 ; cmovzq %r11, %rax, %rax
 ; cmovzq %r10, %rdx, %rdx
@@ -49,21 +48,19 @@ block0(v0: i128, v1: i64):
 ; movq %rsp, %rbp
 ; block0:
 ; movq %rdx, %rcx
-; movq %rdi, %rdx
-; shrq %cl, %rdx, %rdx
+; movq %rdi, %r8
+; shrq %cl, %r8, %r8
 ; movq %rsi, %r9
 ; shrq %cl, %r9, %r9
-; movq %rcx, %r10
 ; movl $64, %ecx
-; movq %r10, %rdi
+; movq %rdx, %rdi
 ; subq %rcx, %rdi, %rcx
 ; movq %rsi, %r10
 ; shlq %cl, %r10, %r10
-; xorq %r8, %r8, %r8
-; testq $127, %rdi
-; cmovzq %r8, %r10, %r10
-; orq %r10, %rdx, %r10
 ; xorq %rdx, %rdx, %rdx
+; testq $127, %rdi
+; cmovzq %rdx, %r10, %r10
+; orq %r10, %r8, %r10
 ; testq $64, %rdi
 ; movq %r9, %rax
 ; cmovzq %r10, %rax, %rax
@@ -82,21 +79,19 @@ block0(v0: i128, v1: i32):
 ; movq %rsp, %rbp
 ; block0:
 ; movq %rdx, %rcx
-; movq %rdi, %rdx
-; shrq %cl, %rdx, %rdx
+; movq %rdi, %r8
+; shrq %cl, %r8, %r8
 ; movq %rsi, %r9
 ; shrq %cl, %r9, %r9
-; movq %rcx, %r10
 ; movl $64, %ecx
-; movq %r10, %rdi
+; movq %rdx, %rdi
 ; subq %rcx, %rdi, %rcx
 ; movq %rsi, %r10
 ; shlq %cl, %r10, %r10
-; xorq %r8, %r8, %r8
-; testq $127, %rdi
-; cmovzq %r8, %r10, %r10
-; orq %r10, %rdx, %r10
 ; xorq %rdx, %rdx, %rdx
+; testq $127, %rdi
+; cmovzq %rdx, %r10, %r10
+; orq %r10, %r8, %r10
 ; testq $64, %rdi
 ; movq %r9, %rax
 ; cmovzq %r10, %rax, %rax
@@ -115,21 +110,19 @@ block0(v0: i128, v1: i16):
 ; movq %rsp, %rbp
 ; block0:
 ; movq %rdx, %rcx
-; movq %rdi, %rdx
-; shrq %cl, %rdx, %rdx
+; movq %rdi, %r8
+; shrq %cl, %r8, %r8
 ; movq %rsi, %r9
 ; shrq %cl, %r9, %r9
-; movq %rcx, %r10
 ; movl $64, %ecx
-; movq %r10, %rdi
+; movq %rdx, %rdi
 ; subq %rcx, %rdi, %rcx
 ; movq %rsi, %r10
 ; shlq %cl, %r10, %r10
-; xorq %r8, %r8, %r8
-; testq $127, %rdi
-; cmovzq %r8, %r10, %r10
-; orq %r10, %rdx, %r10
 ; xorq %rdx, %rdx, %rdx
+; testq $127, %rdi
+; cmovzq %rdx, %r10, %r10
+; orq %r10, %r8, %r10
 ; testq $64, %rdi
 ; movq %r9, %rax
 ; cmovzq %r10, %rax, %rax
@@ -148,21 +141,19 @@ block0(v0: i128, v1: i8):
 ; movq %rsp, %rbp
 ; block0:
 ; movq %rdx, %rcx
-; movq %rdi, %rdx
-; shrq %cl, %rdx, %rdx
+; movq %rdi, %r8
+; shrq %cl, %r8, %r8
 ; movq %rsi, %r9
 ; shrq %cl, %r9, %r9
-; movq %rcx, %r10
 ; movl $64, %ecx
-; movq %r10, %rdi
+; movq %rdx, %rdi
 ; subq %rcx, %rdi, %rcx
 ; movq %rsi, %r10
 ; shlq %cl, %r10, %r10
-; xorq %r8, %r8, %r8
-; testq $127, %rdi
-; cmovzq %r8, %r10, %r10
-; orq %r10, %rdx, %r10
 ; xorq %rdx, %rdx, %rdx
+; testq $127, %rdi
+; cmovzq %rdx, %r10, %r10
+; orq %r10, %r8, %r10
 ; testq $64, %rdi
 ; movq %r9, %rax
 ; cmovzq %r10, %rax, %rax