[RFC] Dynamic Vector Support (#4200)

Introduce a new concept in the IR that allows a producer to create
dynamic vector types. An IR function can now contain global values
that represent a dynamic scaling factor for a given fixed-width
vector type. A dynamic type is then created by 'multiplying' the
corresponding global value with a fixed-width type. These new types
can be used just like the existing types, and the type system has a
set of hard-coded dynamic types, such as I32X4XN, onto which the
user-defined types map. The dynamic types are also used explicitly
to create dynamic stack slots, which, unlike their existing
counterparts, have no fixed size. New IR instructions are added to
access these new stack entities.

Currently, during codegen, the dynamic scaling factor has to be
lowered to a constant, so dynamic slots, like spill slots, do
eventually have a compile-time-known size.

The current lowering for aarch64 targets only Neon, using a dynamic
scale of 1.
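
With this lowering, reading the scale back simply materializes the
constant 1 (cf. the %store_scale test below):

    v0 = global_value.i64 gv0   ; read the i32x4 scaling factor
    ; lowers to: movz x2, #1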

Copyright (c) 2022, Arm Limited.
commit 9c43749dfe (parent 9ae060a12a)
Sam Parker, 2022-07-07 20:54:39 +01:00 (committed by GitHub)
69 changed files with 2422 additions and 294 deletions

@@ -0,0 +1,164 @@
test compile
target aarch64
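
; Saturating narrowing (snarrow/unarrow/uunarrow) on dynamic vector
; types, expected to lower to Neon sqxtn/sqxtun/uqxtn pairs.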
function %snarrow_i16x8(i16) -> i8x16 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i16x8*gv0
dt1 = i8x16*gv1
block0(v0: i16):
v1 = splat.dt0 v0
v2 = snarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sqxtn v0.8b, v2.8h
; nextln: sqxtn2 v0.16b, v2.8h
; nextln: ret
function %snarrow_i32x4(i32) -> i16x8 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i32x4*gv0
dt1 = i16x8*gv1
block0(v0: i32):
v1 = splat.dt0 v0
v2 = snarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sqxtn v0.4h, v2.4s
; nextln: sqxtn2 v0.8h, v2.4s
; nextln: ret
function %snarrow_i64x2(i64) -> i32x4 {
gv0 = dyn_scale_target_const.i64x2
gv1 = dyn_scale_target_const.i32x4
dt0 = i64x2*gv0
dt1 = i32x4*gv1
block0(v0: i64):
v1 = splat.dt0 v0
v2 = snarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.2d, x0
; nextln: sqxtn v0.2s, v2.2d
; nextln: sqxtn2 v0.4s, v2.2d
; nextln: ret
function %unarrow_i16x8(i16) -> i8x16 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i16x8*gv0
dt1 = i8x16*gv1
block0(v0: i16):
v1 = splat.dt0 v0
v2 = unarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sqxtun v0.8b, v2.8h
; nextln: sqxtun2 v0.16b, v2.8h
; nextln: ret
function %unarrow_i32x4(i32) -> i16x8 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i32x4*gv0
dt1 = i16x8*gv1
block0(v0: i32):
v1 = splat.dt0 v0
v2 = unarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sqxtun v0.4h, v2.4s
; nextln: sqxtun2 v0.8h, v2.4s
; nextln: ret
function %unarrow_i64x2(i64) -> i32x4 {
gv0 = dyn_scale_target_const.i64x2
gv1 = dyn_scale_target_const.i32x4
dt0 = i64x2*gv0
dt1 = i32x4*gv1
block0(v0: i64):
v1 = splat.dt0 v0
v2 = unarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.2d, x0
; nextln: sqxtun v0.2s, v2.2d
; nextln: sqxtun2 v0.4s, v2.2d
; nextln: ret
function %uunarrow_i16x8(i16) -> i8x16 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i16x8*gv0
dt1 = i8x16*gv1
block0(v0: i16):
v1 = splat.dt0 v0
v2 = uunarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: uqxtn v0.8b, v2.8h
; nextln: uqxtn2 v0.16b, v2.8h
; nextln: ret
function %uunarrow_i32x4(i32) -> i16x8 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i32x4*gv0
dt1 = i16x8*gv1
block0(v0: i32):
v1 = splat.dt0 v0
v2 = uunarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: uqxtn v0.4h, v2.4s
; nextln: uqxtn2 v0.8h, v2.4s
; nextln: ret
function %uunarrow_i64x2(i64) -> i32x4 {
gv0 = dyn_scale_target_const.i64x2
gv1 = dyn_scale_target_const.i32x4
dt0 = i64x2*gv0
dt1 = i32x4*gv1
block0(v0: i64):
v1 = splat.dt0 v0
v2 = uunarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.2d, x0
; nextln: uqxtn v0.2s, v2.2d
; nextln: uqxtn2 v0.4s, v2.2d
; nextln: ret

@@ -0,0 +1,104 @@
test compile
target aarch64
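
; Splat plus basic arithmetic (iadd/isub/imul/fadd/fsub) on dynamic
; vector types, lowered to ordinary Neon dup and arithmetic ops.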
function %i8x16_splat_add(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.16b, w0
; nextln: dup v6.16b, w1
; nextln: add v0.16b, v4.16b, v6.16b
; nextln: ret
function %i16x8_splat_add(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.8h, w0
; nextln: dup v6.8h, w1
; nextln: add v0.8h, v4.8h, v6.8h
; nextln: ret
function %i32x4_splat_mul(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.4s, w0
; nextln: dup v6.4s, w1
; nextln: mul v0.4s, v4.4s, v6.4s
; nextln: ret
function %i64x2_splat_sub(i64, i64) -> i64x2 {
gv0 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv0
block0(v0: i64, v1: i64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.2d, x0
; nextln: dup v6.2d, x1
; nextln: sub v0.2d, v4.2d, v6.2d
; nextln: ret
function %f32x4_splat_add(f32, f32) -> f32x4 {
gv0 = dyn_scale_target_const.f32x4
dt0 = f32x4*gv0
block0(v0: f32, v1: f32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.4s, v0.s[0]
; nextln: dup v6.4s, v1.s[0]
; nextln: fadd v0.4s, v4.4s, v6.4s
; nextln: ret
function %f64x2_splat_sub(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
dt0 = f64x2*gv0
block0(v0: f64, v1: f64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fsub v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fsub v0.2d, v4.2d, v6.2d
; nextln: ret

@@ -0,0 +1,104 @@
test compile
target aarch64
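
; Signed widening (swiden_low/swiden_high) on dynamic vector types,
; lowered to Neon sxtl/sxtl2.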
function %swidenhigh_i8x16(i8) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv1
dt1 = i16x8*gv0
block0(v0: i8):
v1 = splat.dt0 v0
v2 = swiden_high v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.16b, w0
; nextln: sxtl2 v0.8h, v2.16b
; nextln: ret
function %swidenhigh_i16x8(i16) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv1
dt1 = i32x4*gv0
block0(v0: i16):
v1 = splat.dt0 v0
v2 = swiden_high v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sxtl2 v0.4s, v2.8h
; nextln: ret
function %swidenhigh_i32x4(i32) -> i64x2 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv1
dt1 = i32x4*gv0
block0(v0: i32):
v1 = splat.dt1 v0
v2 = swiden_high v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sxtl2 v0.2d, v2.4s
; nextln: ret
function %swidenlow_i8x16(i8) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv1
dt1 = i16x8*gv0
block0(v0: i8):
v1 = splat.dt0 v0
v2 = swiden_low v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.16b, w0
; nextln: sxtl v0.8h, v2.8b
; nextln: ret
function %swidenlow_i16x8(i16) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv1
dt1 = i32x4*gv0
block0(v0: i16):
v1 = splat.dt0 v0
v2 = swiden_low v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sxtl v0.4s, v2.4h
; nextln: ret
function %swidenlow_i32x4(i32) -> i64x2 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv1
dt1 = i32x4*gv0
block0(v0: i32):
v1 = splat.dt1 v0
v2 = swiden_low v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sxtl v0.2d, v2.2s
; nextln: ret

@@ -0,0 +1,129 @@
test compile precise-output
target aarch64
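
; Dynamic stack slots: materializing the scale, storing and loading
; through an explicit_dynamic_slot, and taking a slot's address.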
function %store_scale() {
gv0 = dyn_scale_target_const.i32x4
ss0 = explicit_slot 8
block0:
v0 = global_value.i64 gv0
stack_store.i64 v0, ss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x0, sp
; movz x2, #1
; str x2, [x0]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %store_scale_lt_128() {
gv0 = dyn_scale_target_const.i16x4
ss0 = explicit_slot 8
block0:
v0 = global_value.i64 gv0
stack_store.i64 v0, ss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x0, sp
; movz x2, #1
; str x2, [x0]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %store_explicit(i32) {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0(v0: i32):
v1 = splat.dt0 v0
dynamic_stack_store.dt0 v1, dss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; dup v2.4s, w0
; mov x4, sp
; str q2, [x4]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %load_explicit() -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0:
v0 = dynamic_stack_load.dt0 dss0
v1 = extract_vector.dt0 v0, 0
return v1
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x3, sp
; ldr q0, [x3]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %store_implicit(i32) {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0(v0: i32):
v1 = splat.dt0 v0
dynamic_stack_store v1, dss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; dup v2.4s, w0
; mov x4, sp
; str q2, [x4]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %addr() -> i64 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0:
v0 = dynamic_stack_addr.i64 dss0
return v0
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x0, sp
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret

@@ -0,0 +1,197 @@
test run
target aarch64
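
; Runtime tests: splat plus arithmetic on dynamic vector types
; produce the expected fixed-width lane values under a scale of 1.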
function %i8x16_splat_add(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i8x16_splat_add(1, 3) == [4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]
function %i16x8_splat_add(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i16x8_splat_add(255, 254) == [509 509 509 509 509 509 509 509]
function %i32x4_splat_add(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i32x4_splat_add(1234, 8765) == [9999 9999 9999 9999]
function %i64x2_splat_add(i64, i64) -> i64x2 {
gv0 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv0
block0(v0: i64, v1: i64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i64x2_splat_add(4321, 8765) == [13086 13086]
function %i8x16_splat_sub(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i8x16_splat_sub(127, 126) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %i16x8_splat_sub(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i16x8_splat_sub(12345, 6789) == [5556 5556 5556 5556 5556 5556 5556 5556]
function %i32x4_splat_sub(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i32x4_splat_sub(1, 3) == [-2 -2 -2 -2]
function %i64x2_splat_sub(i64, i64) -> i64x2 {
gv0 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv0
block0(v0: i64, v1: i64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i64x2_splat_sub(255, 65535) == [-65280 -65280]
function %i8x16_splat_mul(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i8x16_splat_mul(15, 15) == [225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225]
function %i16x8_splat_mul(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i16x8_splat_mul(135, 246) == [33210 33210 33210 33210 33210 33210 33210 33210]
function %i32x4_splat_mul(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i32x4_splat_mul(2, 3) == [6 6 6 6]
function %f32x4_splat_add(f32, f32) -> f32x4 {
gv0 = dyn_scale_target_const.f32x4
dt0 = f32x4*gv0
block0(v0: f32, v1: f32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f32x4_splat_add(0x1.2, 0x3.4) == [0x4.6 0x4.6 0x4.6 0x4.6]
function %f64x2_splat_add(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
dt0 = f64x2*gv0
block0(v0: f64, v1: f64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f64x2_splat_add(0x1.0, 0x2.0) == [0x3.0 0x3.0]
function %f32x4_splat_sub(f32, f32) -> f32x4 {
gv0 = dyn_scale_target_const.f32x4
dt0 = f32x4*gv0
block0(v0: f32, v1: f32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fsub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f32x4_splat_sub(0x1.2, 0x3.4) == [-0x2.2 -0x2.2 -0x2.2 -0x2.2]
function %f64x2_splat_sub(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
dt0 = f64x2*gv0
block0(v0: f64, v1: f64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fsub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f64x2_splat_sub(0x1.0, 0x3.0) == [-0x2.0 -0x2.0]