aarch64: Specialize constant vector shifts (#5976)

* aarch64: Specialize constant vector shifts

This commit adds special lowering rules for vector shifts by constant
amounts so that they use the dedicated immediate-shift instructions,
which cuts the generated code down considerably when the shift amount
is a known constant.

* Fix codegen for right shifts by zero

* Special-case zero left-shifts as well

* Remove left-shift special case

Alex Crichton, 2023-03-13 17:37:59 -05:00 (committed by GitHub)
parent 90c9bec225
commit d6ce632b5b
7 changed files with 554 additions and 45 deletions
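
As a rough sketch of the lowering this commit adds (not Cranelift's actual ISLE rules; the names `Lowered` and `lower_const_shift` are invented for illustration), the Rust snippet below models how a constant shift amount is masked to the lane width and then mapped onto the immediate-form shift instructions, including the case where a masked-to-zero right shift emits nothing at all:

// Illustrative sketch only: the real lowering lives in Cranelift's ISLE
// rules for the aarch64 backend; these names are made up for this example.

#[derive(Debug, PartialEq)]
enum Lowered {
    ShlImm(u8), // `shl vN.T, vM.T, #amt`; #0 is encodable for left shifts
    ShrImm(u8), // `ushr`/`sshr` by immediate; the encoding only allows 1..=lane_bits
    Nop,        // a right shift by 0 is a no-op and `ushr`/`sshr` cannot encode #0
}

// `lane_bits` is the lane width in bits (8, 16, 32, or 64); the shift amount
// is taken modulo the lane width, matching CLIF shift semantics.
fn lower_const_shift(is_left_shift: bool, amount: u64, lane_bits: u8) -> Lowered {
    let amt = (amount & u64::from(lane_bits - 1)) as u8;
    if is_left_shift {
        Lowered::ShlImm(amt)
    } else if amt == 0 {
        Lowered::Nop
    } else {
        Lowered::ShrImm(amt)
    }
}

fn main() {
    // ishl.i8x16 by 8: 8 & 7 == 0, hence the `shl ..., #0` seen in the tests below.
    assert_eq!(lower_const_shift(true, 8, 8), Lowered::ShlImm(0));
    // ushr.i64x2 by 64: masked to 0, so the shift disappears (a bare `ret`).
    assert_eq!(lower_const_shift(false, 64, 64), Lowered::Nop);
    // ushr.i8x16 by 1 becomes a single `ushr ..., #1`.
    assert_eq!(lower_const_shift(false, 1, 8), Lowered::ShrImm(1));
}

The `*_full_width` tests added below exercise exactly these masked-to-zero cases: a full-width left shift still emits `shl ..., #0`, while full-width right shifts lower to nothing but `ret`.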

@@ -561,20 +561,12 @@ block0(v0: i8x16):
; VCode:
; block0:
-; movz x2, #1
-; and w4, w2, #7
-; sub x6, xzr, x4
-; dup v16.16b, w6
-; ushl v0.16b, v0.16b, v16.16b
+; ushr v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
-; mov x2, #1
-; and w4, w2, #7
-; neg x6, x4
-; dup v16.16b, w6
-; ushl v0.16b, v0.16b, v16.16b
+; ushr v0.16b, v0.16b, #1
; ret
function %add_i128(i128, i128) -> i128 {

@@ -128,3 +128,395 @@ block0(v0: i64x2, v1: i64x2):
; add v0.2d, v17.2d, v23.2d
; ret
function %ishl_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 1
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.16b, v0.16b, #1
; ret
function %ishl_i16x8_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 15
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.8h, v0.8h, #15
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.8h, v0.8h, #0xf
; ret
function %ishl_i32x4_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 22
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.4s, v0.4s, #22
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.4s, v0.4s, #0x16
; ret
function %ishl_i64x2_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 55
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.2d, v0.2d, #55
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.2d, v0.2d, #0x37
; ret
function %sshr_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 1
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.16b, v0.16b, #1
; ret
function %sshr_i16x8_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 15
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.8h, v0.8h, #15
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.8h, v0.8h, #0xf
; ret
function %sshr_i32x4_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 22
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.4s, v0.4s, #22
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.4s, v0.4s, #0x16
; ret
function %sshr_i64x2_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 55
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.2d, v0.2d, #55
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.2d, v0.2d, #0x37
; ret
function %ushr_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 1
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.16b, v0.16b, #1
; ret
function %ushr_i16x8_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 15
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.8h, v0.8h, #15
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.8h, v0.8h, #0xf
; ret
function %ushr_i32x4_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 22
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.4s, v0.4s, #22
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.4s, v0.4s, #0x16
; ret
function %ushr_i64x2_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 55
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.2d, v0.2d, #55
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.2d, v0.2d, #0x37
; ret
function %ishl_i8x16_full_width(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 8
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.16b, v0.16b, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.16b, v0.16b, #0
; ret
function %ishl_i16x8_full_width(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 16
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.8h, v0.8h, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.8h, v0.8h, #0
; ret
function %ishl_i32x4_full_width(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 32
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.4s, v0.4s, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.4s, v0.4s, #0
; ret
function %ishl_i64x2_full_width(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 64
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.2d, v0.2d, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.2d, v0.2d, #0
; ret
function %sshr_i8x16_full_width(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 8
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
function %sshr_i16x8_full_width(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 16
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
function %sshr_i32x4_full_width(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 32
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
function %sshr_i64x2_full_width(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 64
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
function %ushr_i8x16_full_width(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 8
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
function %ushr_i16x8_full_width(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 16
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
function %ushr_i32x4_full_width(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 32
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
function %ushr_i64x2_full_width(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 64
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret

@@ -259,22 +259,14 @@ block0:
; VCode:
; block0:
-; ldr q5, [const(0)]
-; movz w1, #1
-; and w3, w1, #7
-; sub x5, xzr, x3
-; dup v7.16b, w5
-; ushl v0.16b, v5.16b, v7.16b
+; ldr q1, [const(0)]
+; ushr v0.16b, v1.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
-; ldr q5, #0x20
-; mov w1, #1
-; and w3, w1, #7
-; neg x5, x3
-; dup v7.16b, w5
-; ushl v0.16b, v5.16b, v7.16b
+; ldr q1, #0x10
+; ushr v0.16b, v1.16b, #1
; ret
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x01, 0x02, 0x03
@@ -321,20 +313,12 @@ block0(v0: i8x16, v1: i32):
; VCode:
; block0:
-; movz w3, #3
-; and w5, w3, #7
-; sub x7, xzr, x5
-; dup v17.16b, w7
-; sshl v0.16b, v0.16b, v17.16b
+; sshr v0.16b, v0.16b, #3
; ret
;
; Disassembled:
; block0: ; offset 0x0
-; mov w3, #3
-; and w5, w3, #7
-; neg x7, x5
-; dup v17.16b, w7
-; sshl v0.16b, v0.16b, v17.16b
+; sshr v0.16b, v0.16b, #3
; ret
function %sshr_i64x2(i64x2, i32) -> i64x2 {