From 7ebff82861b28102da7354ec9faaa72d957a4efd Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Mon, 17 Apr 2023 13:48:08 -0500
Subject: [PATCH] Optimize sign extension via shifts (#6220)

* Optimize sign extension via shifts

This commit adds egraph optimization patterns for left-shifting a value
and then right-shifting it as a form of sign extending its lower bits.
This matches the behavior of the WebAssembly `i32.extend8_s`
instruction, for example. Note that the lowering of that WebAssembly
instruction does not use shifts, but historical versions of LLVM that
didn't support the instruction, or versions with the instruction
disabled, will use shifts instead.

A second rule for reduction-of-extend being the same as the original
value was added to keep an existing shift-related test passing as well.

* Add reference assemblies for new opts
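
As a rough illustration of the scalar identity these rules rely on, here is a
small standalone Rust sketch (the function names are invented for the
example): shifting an i32 left by 24 and then arithmetically right by 24
behaves the same as truncating to i8 and sign-extending back, which is the
`ireduce`-then-`sextend` shape the new rules rewrite the shift pair into.

    // Sign-extend the low 8 bits of an i32 via shifts, the pattern emitted by
    // older LLVM versions instead of `i32.extend8_s`.
    fn sext8_via_shifts(x: i32) -> i32 {
        (x << 24) >> 24
    }

    // Sign-extend the low 8 bits via truncation and re-extension, the form the
    // egraph rules rewrite the shift pair into.
    fn sext8_via_extend(x: i32) -> i32 {
        (x as i8) as i32
    }

    fn main() {
        for x in [0, 1, 0x7f, 0x80, 0xff, 0x1234_5678, -1, i32::MIN, i32::MAX] {
            assert_eq!(sext8_via_shifts(x), sext8_via_extend(x));
        }
        println!("shift-based and extend-based sign extension agree");
    }
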
---
 cranelift/codegen/src/opts/extends.isle       |   5 +
 cranelift/codegen/src/opts/shifts.isle        |  19 ++
 .../filetests/filetests/egraph/extends.clif   |  18 ++
 .../filetests/filetests/egraph/shifts.clif    | 100 +++++++
 .../filetests/isa/x64/shift-to-extend.clif    | 265 ++++++++++++++++++
 5 files changed, 407 insertions(+)
 create mode 100644 cranelift/filetests/filetests/isa/x64/shift-to-extend.clif

diff --git a/cranelift/codegen/src/opts/extends.isle b/cranelift/codegen/src/opts/extends.isle
index a64fba1b6c..a1a4dfda05 100644
--- a/cranelift/codegen/src/opts/extends.isle
+++ b/cranelift/codegen/src/opts/extends.isle
@@ -27,3 +27,8 @@
         (uextend $I64 x @ (value_type $I32))
         (iconst _ (u64_from_imm64 0))))
       (iconst ty (imm64 1)))
+
+;; A reduction-of-an-extend back to the same original type is the same as not
+;; actually doing the extend in the first place.
+(rule (simplify (ireduce ty (sextend _ x @ (value_type ty)))) x)
+(rule (simplify (ireduce ty (uextend _ x @ (value_type ty)))) x)
diff --git a/cranelift/codegen/src/opts/shifts.isle b/cranelift/codegen/src/opts/shifts.isle
index 44c79a6f8d..8dcf153d1a 100644
--- a/cranelift/codegen/src/opts/shifts.isle
+++ b/cranelift/codegen/src/opts/shifts.isle
@@ -79,6 +79,25 @@
       (if-let $true (u64_le shift_u64 (u64_sub (ty_bits_u64 wide) (ty_bits_u64 narrow))))
       x)
 
+;; (x << N) >> N == x as T_SMALL as T_LARGE
+;; if N == bitsizeof(T_LARGE) - bitsizeof(T_SMALL)
+;;
+;; Note that the shift is required to be >0 to ensure this doesn't accidentally
+;; try to `ireduce` a type to itself, which isn't a valid use of `ireduce`.
+(rule (simplify (sshr ty (ishl ty x (iconst _ shift)) (iconst _ shift)))
+      (if-let (u64_from_imm64 (u64_nonzero shift_u64)) shift)
+      (if-let ty_small (shift_amt_to_type (u64_sub (ty_bits ty) shift_u64)))
+      (sextend ty (ireduce ty_small x)))
+(rule (simplify (ushr ty (ishl ty x (iconst _ shift)) (iconst _ shift)))
+      (if-let (u64_from_imm64 (u64_nonzero shift_u64)) shift)
+      (if-let ty_small (shift_amt_to_type (u64_sub (ty_bits ty) shift_u64)))
+      (uextend ty (ireduce ty_small x)))
+
+(decl pure partial shift_amt_to_type (u64) Type)
+(rule (shift_amt_to_type 8) $I8)
+(rule (shift_amt_to_type 16) $I16)
+(rule (shift_amt_to_type 32) $I32)
+
 ;; ineg(ushr(x, k)) == sshr(x, k) when k == ty_bits - 1.
 (rule (simplify (ineg ty (ushr ty x sconst @ (iconst ty (u64_from_imm64 shift_amt)))))
       (if-let $true (u64_eq shift_amt (u64_sub (ty_bits ty) 1)))
diff --git a/cranelift/filetests/filetests/egraph/extends.clif b/cranelift/filetests/filetests/egraph/extends.clif
index bfc9876044..03454ca6d0 100644
--- a/cranelift/filetests/filetests/egraph/extends.clif
+++ b/cranelift/filetests/filetests/egraph/extends.clif
@@ -53,3 +53,21 @@ block0(v1: i16):
 
 ; check: v4 = sextend.i64 v1
 ; check: return v4
+
+function %sextend_then_reduce(i16) -> i16 {
+block0(v1: i16):
+    v2 = sextend.i32 v1
+    v3 = ireduce.i16 v2
+    return v3
+}
+
+; check: return v1
+
+function %uextend_then_reduce(i32) -> i32 {
+block0(v1: i32):
+    v2 = uextend.i64 v1
+    v3 = ireduce.i32 v2
+    return v3
+}
+
+; check: return v1
diff --git a/cranelift/filetests/filetests/egraph/shifts.clif b/cranelift/filetests/filetests/egraph/shifts.clif
index d9c9da277d..b8028c50e6 100644
--- a/cranelift/filetests/filetests/egraph/shifts.clif
+++ b/cranelift/filetests/filetests/egraph/shifts.clif
@@ -215,3 +215,103 @@ block0(v0: i64):
 ; check: v4 = sshr v0, v1
 ; check: return v4
 }
+
+function %i32_shl_sshr_8_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 24
+    v2 = sshr_imm v1, 24
+    return v2
+    ; check: v5 = ireduce.i8 v0
+    ; check: v6 = sextend.i32 v5
+    ; check: return v6
+}
+
+function %i32_shl_sshr_16_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 16
+    v2 = sshr_imm v1, 16
+    return v2
+    ; check: v5 = ireduce.i16 v0
+    ; check: v6 = sextend.i32 v5
+    ; check: return v6
+}
+
+function %i64_shl_sshr_8_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 56
+    v2 = sshr_imm v1, 56
+    return v2
+    ; check: v5 = ireduce.i8 v0
+    ; check: v6 = sextend.i64 v5
+    ; check: return v6
+}
+
+function %i64_shl_sshr_16_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 48
+    v2 = sshr_imm v1, 48
+    return v2
+    ; check: v5 = ireduce.i16 v0
+    ; check: v6 = sextend.i64 v5
+    ; check: return v6
+}
+
+function %i64_shl_sshr_32_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 32
+    v2 = sshr_imm v1, 32
+    return v2
+    ; check: v5 = ireduce.i32 v0
+    ; check: v6 = sextend.i64 v5
+    ; check: return v6
+}
+
+function %i32_shl_ushr_8_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 24
+    v2 = ushr_imm v1, 24
+    return v2
+    ; check: v7 = ireduce.i8 v0
+    ; check: v8 = uextend.i32 v7
+    ; check: return v8
+}
+
+function %i32_shl_ushr_16_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 16
+    v2 = ushr_imm v1, 16
+    return v2
+    ; check: v7 = ireduce.i16 v0
+    ; check: v8 = uextend.i32 v7
+    ; check: return v8
+}
+
+function %i64_shl_ushr_8_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 56
+    v2 = ushr_imm v1, 56
+    return v2
+    ; check: v7 = ireduce.i8 v0
+    ; check: v8 = uextend.i64 v7
+    ; check: return v8
+}
+
+function %i64_shl_ushr_16_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 48
+    v2 = ushr_imm v1, 48
+    return v2
+    ; check: v7 = ireduce.i16 v0
+    ; check: v8 = uextend.i64 v7
+    ; check: return v8
+}
+
+function %i64_shl_ushr_32_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 32
+    v2 = ushr_imm v1, 32
+    return v2
+    ; check: v7 = ireduce.i32 v0
+    ; check: v8 = uextend.i64 v7
+    ; check: return v8
+}
diff --git a/cranelift/filetests/filetests/isa/x64/shift-to-extend.clif b/cranelift/filetests/filetests/isa/x64/shift-to-extend.clif
new file mode 100644
index 0000000000..219fe75a78
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/shift-to-extend.clif
@@ -0,0 +1,265 @@
+test compile precise-output
+set opt_level=speed
+target x86_64
+
+
+function %i32_shl_sshr_8_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 24
+    v2 = sshr_imm v1, 24
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movsbl %dil, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movsbl %dil, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i32_shl_sshr_16_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 16
+    v2 = sshr_imm v1, 16
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movswl %di, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movswl %di, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i64_shl_sshr_8_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 56
+    v2 = sshr_imm v1, 56
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movsbq %dil, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movsbq %dil, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i64_shl_sshr_16_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 48
+    v2 = sshr_imm v1, 48
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movswq %di, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movswq %di, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i64_shl_sshr_32_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 32
+    v2 = sshr_imm v1, 32
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movslq %edi, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movslq %edi, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i32_shl_ushr_8_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 24
+    v2 = ushr_imm v1, 24
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movzbl %dil, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movzbl %dil, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i32_shl_ushr_16_to_ireduce(i32) -> i32 {
+block0(v0: i32):
+    v1 = ishl_imm v0, 16
+    v2 = ushr_imm v1, 16
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movzwl %di, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movzwl %di, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i64_shl_ushr_8_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 56
+    v2 = ushr_imm v1, 56
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movzbq %dil, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movzbq %dil, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i64_shl_ushr_16_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 48
+    v2 = ushr_imm v1, 48
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movzwq %di, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movzwq %di, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %i64_shl_ushr_32_to_ireduce(i64) -> i64 {
+block0(v0: i64):
+    v1 = ishl_imm v0, 32
+    v2 = ushr_imm v1, 32
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movl %edi, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; block1: ; offset 0x4
+; movl %edi, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+