aarch64: Specialize constant vector shifts (#5976)

* aarch64: Specialize constant vector shifts

This commit adds special lowering rules for vector shifts by constant
amounts. These rules use the dedicated shift-by-immediate instructions,
which cuts down on the generated code quite a bit for constant shift values.

* Fix codegen for 0-shift-rights

* Special-case zero left-shifts as well

* Remove left-shift special case
This commit is contained in:
Alex Crichton
2023-03-13 17:37:59 -05:00
committed by GitHub
parent 90c9bec225
commit d6ce632b5b
7 changed files with 554 additions and 45 deletions

View File

@@ -561,20 +561,12 @@ block0(v0: i8x16):
; VCode:
; block0:
; movz x2, #1
; and w4, w2, #7
; sub x6, xzr, x4
; dup v16.16b, w6
; ushl v0.16b, v0.16b, v16.16b
; ushr v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov x2, #1
; and w4, w2, #7
; neg x6, x4
; dup v16.16b, w6
; ushl v0.16b, v0.16b, v16.16b
; ushr v0.16b, v0.16b, #1
; ret
function %add_i128(i128, i128) -> i128 {

View File

@@ -128,3 +128,395 @@ block0(v0: i64x2, v1: i64x2):
; add v0.2d, v17.2d, v23.2d
; ret
;; Left shift of an i8x16 vector by the constant 1. The expectation recorded
;; after this function is a single `shl` with an immediate operand.
function %ishl_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 1
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.16b, v0.16b, #1
; ret
;; Left shift of an i16x8 vector by the constant 15 (one less than the 16-bit
;; lane width). The recorded expectation is a single `shl ... #15`.
function %ishl_i16x8_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 15
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.8h, v0.8h, #15
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.8h, v0.8h, #0xf
; ret
;; Left shift of an i32x4 vector by the constant 22. The recorded expectation
;; is a single `shl ... #22`.
function %ishl_i32x4_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 22
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.4s, v0.4s, #22
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.4s, v0.4s, #0x16
; ret
;; Left shift of an i64x2 vector by the constant 55. The recorded expectation
;; is a single `shl ... #55`.
function %ishl_i64x2_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 55
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.2d, v0.2d, #55
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.2d, v0.2d, #0x37
; ret
;; Signed (arithmetic) right shift of an i8x16 vector by the constant 1. The
;; recorded expectation is a single `sshr` with an immediate operand.
function %sshr_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 1
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.16b, v0.16b, #1
; ret
;; Signed right shift of an i16x8 vector by the constant 15. The recorded
;; expectation is a single `sshr ... #15`.
function %sshr_i16x8_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 15
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.8h, v0.8h, #15
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.8h, v0.8h, #0xf
; ret
;; Signed right shift of an i32x4 vector by the constant 22. The recorded
;; expectation is a single `sshr ... #22`.
function %sshr_i32x4_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 22
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.4s, v0.4s, #22
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.4s, v0.4s, #0x16
; ret
;; Signed right shift of an i64x2 vector by the constant 55. The recorded
;; expectation is a single `sshr ... #55`.
function %sshr_i64x2_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 55
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; sshr v0.2d, v0.2d, #55
; ret
;
; Disassembled:
; block0: ; offset 0x0
; sshr v0.2d, v0.2d, #0x37
; ret
;; Unsigned (logical) right shift of an i8x16 vector by the constant 1. The
;; recorded expectation is a single `ushr` with an immediate operand.
function %ushr_i8x16_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 1
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.16b, v0.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.16b, v0.16b, #1
; ret
;; Unsigned right shift of an i16x8 vector by the constant 15. The recorded
;; expectation is a single `ushr ... #15`.
function %ushr_i16x8_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 15
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.8h, v0.8h, #15
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.8h, v0.8h, #0xf
; ret
;; Unsigned right shift of an i32x4 vector by the constant 22. The recorded
;; expectation is a single `ushr ... #22`.
function %ushr_i32x4_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 22
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.4s, v0.4s, #22
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.4s, v0.4s, #0x16
; ret
;; Unsigned right shift of an i64x2 vector by the constant 55. The recorded
;; expectation is a single `ushr ... #55`.
function %ushr_i64x2_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 55
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ushr v0.2d, v0.2d, #55
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ushr v0.2d, v0.2d, #0x37
; ret
;; Left shift of an i8x16 vector by 8, i.e. by the full lane width. The
;; recorded expectation is `shl ... #0`.
;; NOTE(review): this implies the amount is reduced modulo the lane width;
;; confirm against the Cranelift `ishl` documentation.
function %ishl_i8x16_full_width(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 8
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.16b, v0.16b, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.16b, v0.16b, #0
; ret
;; Left shift of an i16x8 vector by 16, i.e. by the full lane width. The
;; recorded expectation is `shl ... #0`.
function %ishl_i16x8_full_width(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 16
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.8h, v0.8h, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.8h, v0.8h, #0
; ret
;; Left shift of an i32x4 vector by 32, i.e. by the full lane width. The
;; recorded expectation is `shl ... #0`.
function %ishl_i32x4_full_width(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 32
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.4s, v0.4s, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.4s, v0.4s, #0
; ret
;; Left shift of an i64x2 vector by 64, i.e. by the full lane width. The
;; recorded expectation is `shl ... #0`.
function %ishl_i64x2_full_width(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 64
v2 = ishl v0, v1
return v2
}
; VCode:
; block0:
; shl v0.2d, v0.2d, #0
; ret
;
; Disassembled:
; block0: ; offset 0x0
; shl v0.2d, v0.2d, #0
; ret
;; Signed right shift of an i8x16 vector by 8, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction at all, only `ret`.
;; NOTE(review): this implies the amount is reduced modulo the lane width to
;; zero; confirm against the Cranelift `sshr` documentation.
function %sshr_i8x16_full_width(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 8
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
;; Signed right shift of an i16x8 vector by 16, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction, only `ret`.
function %sshr_i16x8_full_width(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 16
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
;; Signed right shift of an i32x4 vector by 32, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction, only `ret`.
function %sshr_i32x4_full_width(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 32
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
;; Signed right shift of an i64x2 vector by 64, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction, only `ret`.
function %sshr_i64x2_full_width(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 64
v2 = sshr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
;; Unsigned right shift of an i8x16 vector by 8, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction at all, only `ret`.
;; NOTE(review): this implies the amount is reduced modulo the lane width to
;; zero; confirm against the Cranelift `ushr` documentation.
function %ushr_i8x16_full_width(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 8
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
;; Unsigned right shift of an i16x8 vector by 16, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction, only `ret`.
function %ushr_i16x8_full_width(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 16
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
;; Unsigned right shift of an i32x4 vector by 32, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction, only `ret`.
function %ushr_i32x4_full_width(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 32
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret
;; Unsigned right shift of an i64x2 vector by 64, i.e. by the full lane width.
;; The recorded expectation contains no shift instruction, only `ret`.
function %ushr_i64x2_full_width(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 64
v2 = ushr v0, v1
return v2
}
; VCode:
; block0:
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ret

View File

@@ -259,22 +259,14 @@ block0:
; VCode:
; block0:
; ldr q5, [const(0)]
; movz w1, #1
; and w3, w1, #7
; sub x5, xzr, x3
; dup v7.16b, w5
; ushl v0.16b, v5.16b, v7.16b
; ldr q1, [const(0)]
; ushr v0.16b, v1.16b, #1
; ret
;
; Disassembled:
; block0: ; offset 0x0
; ldr q5, #0x20
; mov w1, #1
; and w3, w1, #7
; neg x5, x3
; dup v7.16b, w5
; ushl v0.16b, v5.16b, v7.16b
; ldr q1, #0x10
; ushr v0.16b, v1.16b, #1
; ret
; .byte 0x00, 0x00, 0x00, 0x00
; .byte 0x00, 0x01, 0x02, 0x03
@@ -321,20 +313,12 @@ block0(v0: i8x16, v1: i32):
; VCode:
; block0:
; movz w3, #3
; and w5, w3, #7
; sub x7, xzr, x5
; dup v17.16b, w7
; sshl v0.16b, v0.16b, v17.16b
; sshr v0.16b, v0.16b, #3
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov w3, #3
; and w5, w3, #7
; neg x7, x5
; dup v17.16b, w7
; sshl v0.16b, v0.16b, v17.16b
; sshr v0.16b, v0.16b, #3
; ret
function %sshr_i64x2(i64x2, i32) -> i64x2 {

View File

@@ -195,3 +195,108 @@ block0(v0: i32x4):
return v1
}
; run: %iabs([-42 -1 0 1]) == [42 1 0 1]
;; Runtime check: left shift of every i8 lane by the constant 2. Bits shifted
;; past bit 7 are dropped, as the run expectation that follows shows
;; (0x40 and 0x80 both become 0).
function %i8x16_shl_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 2
v2 = ishl v0, v1
return v2
}
; run: %i8x16_shl_imm([0x01 0x02 0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0]) == [0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0 0 0]
;; Runtime check: left shift of every i16 lane by the constant 4. The run
;; expectations that follow cover both in-range results and lanes whose set
;; bit is shifted out (result 0).
function %i16x8_shl_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 4
v2 = ishl v0, v1
return v2
}
; run: %i16x8_shl_imm([0x0001 0x0002 0x0004 0x0008 0x0010 0x0020 0x0040 0x0080]) == [0x0010 0x0020 0x0040 0x0080 0x0100 0x0200 0x0400 0x0800]
; run: %i16x8_shl_imm([0x0100 0x0200 0x0400 0x0800 0x1000 0x2000 0x4000 0x8000]) == [0x1000 0x2000 0x4000 0x8000 0 0 0 0]
;; Runtime check: left shift of every i32 lane by the constant 4. The run
;; expectations that follow include lanes whose high bits are shifted out.
function %i32x4_shl_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 4
v2 = ishl v0, v1
return v2
}
; run: %i32x4_shl_imm([0x00000001 0x00000002 0x00000004 0x00000008]) == [0x00000010 0x00000020 0x00000040 0x00000080]
; run: %i32x4_shl_imm([0x10000000 0x00010000 0xf0000000 0x02000000]) == [0 0x00100000 0 0x20000000]
;; Runtime check: left shift of every i64 lane by the constant 32 (half the
;; lane width). A lane holding 0x100000000 overflows to 0 in the run
;; expectations that follow.
function %i64x2_shl_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 32
v2 = ishl v0, v1
return v2
}
; run: %i64x2_shl_imm([0x1 0xf]) == [0x100000000 0xf00000000]
; run: %i64x2_shl_imm([0x100000000 0]) == [0 0]
;; Runtime check: signed (arithmetic) right shift of every i8 lane by the
;; constant 2. The sign bit is replicated, so 0x80 becomes 0xe0.
function %i8x16_sshr_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 2
v2 = sshr v0, v1
return v2
}
;; The directive below must name this function: its expected lanes are the
;; result of a signed right shift by 2, not of a left shift.
; run: %i8x16_sshr_imm([0x01 0x02 0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0]) == [0 0 0x01 0x02 0x04 0x08 0x10 0xe0 0 0 0 0 0 0 0 0]
;; Runtime check: signed right shift of every i16 lane by the constant 4.
;; The run expectations that follow include negative lanes, which stay
;; negative (e.g. -16 becomes -1, 0x8000 becomes 0xf800).
function %i16x8_sshr_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 4
v2 = sshr v0, v1
return v2
}
; run: %i16x8_sshr_imm([0x0001 0x0002 0x0004 0x0008 0x0010 0x0020 0x0040 0x0080]) == [0 0 0 0 0x1 0x2 0x4 0x8]
; run: %i16x8_sshr_imm([-1 -2 -4 -8 -16 16 0x8000 0x80f3]) == [-1 -1 -1 -1 -1 1 0xf800 0xf80f]
;; Runtime check: signed right shift of every i32 lane by the constant 4.
;; Lanes with the top bit set are sign-extended in the run expectation that
;; follows (0x80000000 becomes 0xf8000000).
function %i32x4_sshr_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 4
v2 = sshr v0, v1
return v2
}
; run: %i32x4_sshr_imm([1 0xfc 0x80000000 0xf83f3000]) == [0 0xf 0xf8000000 0xff83f300]
;; Runtime check: signed right shift of every i64 lane by the constant 32.
;; In the run expectations that follow, -1 stays -1 because the sign bit is
;; replicated.
function %i64x2_sshr_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 32
v2 = sshr v0, v1
return v2
}
; run: %i64x2_sshr_imm([0x1 0xf]) == [0 0]
; run: %i64x2_sshr_imm([0x100000000 0]) == [1 0]
; run: %i64x2_sshr_imm([-1 -1]) == [-1 -1]
;; Runtime check: unsigned (logical) right shift of every i8 lane by the
;; constant 2. Zeros are shifted in from the top, so 0x80 becomes 0x20.
function %i8x16_ushr_imm(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i32 2
v2 = ushr v0, v1
return v2
}
;; The directive below must name this function: its expected lanes are the
;; result of an unsigned right shift by 2, not of a left shift.
; run: %i8x16_ushr_imm([0x01 0x02 0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0]) == [0 0 0x01 0x02 0x04 0x08 0x10 0x20 0 0 0 0 0 0 0 0]
;; Runtime check: unsigned right shift of every i16 lane by the constant 4.
;; Negative inputs are treated as bit patterns in the run expectations that
;; follow (e.g. -1 becomes 0x0fff).
function %i16x8_ushr_imm(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i32 4
v2 = ushr v0, v1
return v2
}
; run: %i16x8_ushr_imm([0x0001 0x0002 0x0004 0x0008 0x0010 0x0020 0x0040 0x0080]) == [0 0 0 0 0x1 0x2 0x4 0x8]
; run: %i16x8_ushr_imm([-1 -2 -4 -8 -16 16 0x8000 0x80f3]) == [0x0fff 0x0fff 0x0fff 0x0fff 0x0fff 1 0x0800 0x080f]
;; Runtime check: unsigned right shift of every i32 lane by the constant 4.
;; Lanes with the top bit set are zero-filled in the run expectation that
;; follows (0x80000000 becomes 0x08000000).
function %i32x4_ushr_imm(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 4
v2 = ushr v0, v1
return v2
}
; run: %i32x4_ushr_imm([1 0xfc 0x80000000 0xf83f3000]) == [0 0xf 0x08000000 0x0f83f300]
;; Runtime check: unsigned right shift of every i64 lane by the constant 32.
;; In the run expectations that follow, -1 becomes 0xffffffff because zeros
;; are shifted in from the top.
function %i64x2_ushr_imm(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i32 32
v2 = ushr v0, v1
return v2
}
; run: %i64x2_ushr_imm([0x1 0xf]) == [0 0]
; run: %i64x2_ushr_imm([0x100000000 0]) == [1 0]
; run: %i64x2_ushr_imm([-1 -1]) == [0xffffffff 0xffffffff]