[RFC] Dynamic Vector Support (#4200)

Introduce a new concept in the IR that allows a producer to create
dynamic vector types. An IR function can now contain global values
that represent a dynamic scaling factor for a given fixed-width
vector type. A dynamic type is then created by 'multiplying' the
corresponding global value with a fixed-width type. These new types
can be used just like the existing types, and the type system has a
set of hard-coded dynamic types, such as I32X4XN, onto which the
user-defined types map. The dynamic types are also used explicitly
to create dynamic stack slots, which, unlike their existing
counterparts, have no fixed size. New IR instructions are added to
access these new stack entities.

Currently, during codegen, the dynamic scaling factor has to be
lowered to a constant, so dynamic slots, like spill slots, do
eventually have a compile-time-known size.

The current lowering for aarch64 targets only Neon, using a dynamic
scale of 1.
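
With this lowering, reading the scale back simply materializes the
constant 1 (cf. the %store_scale test below):

    v0 = global_value.i64 gv0   ; read the i32x4 scaling factor
    ; lowers to: movz x2, #1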

Copyright (c) 2022, Arm Limited.
commit 9c43749dfe (parent 9ae060a12a)
Sam Parker, 2022-07-07 20:54:39 +01:00 (committed by GitHub)
69 changed files with 2422 additions and 294 deletions

@@ -0,0 +1,164 @@
test compile
target aarch64
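
; Saturating narrowing (snarrow/unarrow/uunarrow) on dynamic vector
; types, expected to lower to Neon sqxtn/sqxtun/uqxtn pairs.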
function %snarrow_i16x8(i16) -> i8x16 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i16x8*gv0
dt1 = i8x16*gv1
block0(v0: i16):
v1 = splat.dt0 v0
v2 = snarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sqxtn v0.8b, v2.8h
; nextln: sqxtn2 v0.16b, v2.8h
; nextln: ret
function %snarrow_i32x4(i32) -> i16x8 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i32x4*gv0
dt1 = i16x8*gv1
block0(v0: i32):
v1 = splat.dt0 v0
v2 = snarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sqxtn v0.4h, v2.4s
; nextln: sqxtn2 v0.8h, v2.4s
; nextln: ret
function %snarrow_i64x2(i64) -> i32x4 {
gv0 = dyn_scale_target_const.i64x2
gv1 = dyn_scale_target_const.i32x4
dt0 = i64x2*gv0
dt1 = i32x4*gv1
block0(v0: i64):
v1 = splat.dt0 v0
v2 = snarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.2d, x0
; nextln: sqxtn v0.2s, v2.2d
; nextln: sqxtn2 v0.4s, v2.2d
; nextln: ret
function %unarrow_i16x8(i16) -> i8x16 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i16x8*gv0
dt1 = i8x16*gv1
block0(v0: i16):
v1 = splat.dt0 v0
v2 = unarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sqxtun v0.8b, v2.8h
; nextln: sqxtun2 v0.16b, v2.8h
; nextln: ret
function %unarrow_i32x4(i32) -> i16x8 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i32x4*gv0
dt1 = i16x8*gv1
block0(v0: i32):
v1 = splat.dt0 v0
v2 = unarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sqxtun v0.4h, v2.4s
; nextln: sqxtun2 v0.8h, v2.4s
; nextln: ret
function %unarrow_i64x2(i64) -> i32x4 {
gv0 = dyn_scale_target_const.i64x2
gv1 = dyn_scale_target_const.i32x4
dt0 = i64x2*gv0
dt1 = i32x4*gv1
block0(v0: i64):
v1 = splat.dt0 v0
v2 = unarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.2d, x0
; nextln: sqxtun v0.2s, v2.2d
; nextln: sqxtun2 v0.4s, v2.2d
; nextln: ret
function %uunarrow_i16x8(i16) -> i8x16 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i16x8*gv0
dt1 = i8x16*gv1
block0(v0: i16):
v1 = splat.dt0 v0
v2 = uunarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: uqxtn v0.8b, v2.8h
; nextln: uqxtn2 v0.16b, v2.8h
; nextln: ret
function %uunarrow_i32x4(i32) -> i16x8 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i32x4*gv0
dt1 = i16x8*gv1
block0(v0: i32):
v1 = splat.dt0 v0
v2 = uunarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: uqxtn v0.4h, v2.4s
; nextln: uqxtn2 v0.8h, v2.4s
; nextln: ret
function %uunarrow_i64x2(i64) -> i32x4 {
gv0 = dyn_scale_target_const.i64x2
gv1 = dyn_scale_target_const.i32x4
dt0 = i64x2*gv0
dt1 = i32x4*gv1
block0(v0: i64):
v1 = splat.dt0 v0
v2 = uunarrow.dt0 v1, v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.2d, x0
; nextln: uqxtn v0.2s, v2.2d
; nextln: uqxtn2 v0.4s, v2.2d
; nextln: ret

@@ -0,0 +1,104 @@
test compile
target aarch64
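
; Splat plus basic arithmetic (iadd/isub/imul/fadd/fsub) on dynamic
; vector types, lowered to ordinary Neon dup and arithmetic ops.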
function %i8x16_splat_add(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.16b, w0
; nextln: dup v6.16b, w1
; nextln: add v0.16b, v4.16b, v6.16b
; nextln: ret
function %i16x8_splat_add(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.8h, w0
; nextln: dup v6.8h, w1
; nextln: add v0.8h, v4.8h, v6.8h
; nextln: ret
function %i32x4_splat_mul(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.4s, w0
; nextln: dup v6.4s, w1
; nextln: mul v0.4s, v4.4s, v6.4s
; nextln: ret
function %i64x2_splat_sub(i64, i64) -> i64x2 {
gv0 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv0
block0(v0: i64, v1: i64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.2d, x0
; nextln: dup v6.2d, x1
; nextln: sub v0.2d, v4.2d, v6.2d
; nextln: ret
function %f32x4_splat_add(f32, f32) -> f32x4 {
gv0 = dyn_scale_target_const.f32x4
dt0 = f32x4*gv0
block0(v0: f32, v1: f32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.4s, v0.s[0]
; nextln: dup v6.4s, v1.s[0]
; nextln: fadd v0.4s, v4.4s, v6.4s
; nextln: ret
function %f64x2_splat_sub(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
dt0 = f64x2*gv0
block0(v0: f64, v1: f64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fsub v2, v3
v5 = extract_vector v4, 0
return v5
}
; check: dup v4.2d, v0.d[0]
; nextln: dup v6.2d, v1.d[0]
; nextln: fsub v0.2d, v4.2d, v6.2d
; nextln: ret

@@ -0,0 +1,104 @@
test compile
target aarch64
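
; Signed widening (swiden_low/swiden_high) on dynamic vector types,
; lowered to Neon sxtl/sxtl2.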
function %swidenhigh_i8x16(i8) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv1
dt1 = i16x8*gv0
block0(v0: i8):
v1 = splat.dt0 v0
v2 = swiden_high v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.16b, w0
; nextln: sxtl2 v0.8h, v2.16b
; nextln: ret
function %swidenhigh_i16x8(i16) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv1
dt1 = i32x4*gv0
block0(v0: i16):
v1 = splat.dt0 v0
v2 = swiden_high v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sxtl2 v0.4s, v2.8h
; nextln: ret
function %swidenhigh_i32x4(i32) -> i64x2 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv1
dt1 = i32x4*gv0
block0(v0: i32):
v1 = splat.dt1 v0
v2 = swiden_high v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sxtl2 v0.2d, v2.4s
; nextln: ret
function %swidenlow_i8x16(i8) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
gv1 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv1
dt1 = i16x8*gv0
block0(v0: i8):
v1 = splat.dt0 v0
v2 = swiden_low v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.16b, w0
; nextln: sxtl v0.8h, v2.8b
; nextln: ret
function %swidenlow_i16x8(i16) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv1
dt1 = i32x4*gv0
block0(v0: i16):
v1 = splat.dt0 v0
v2 = swiden_low v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.8h, w0
; nextln: sxtl v0.4s, v2.4h
; nextln: ret
function %swidenlow_i32x4(i32) -> i64x2 {
gv0 = dyn_scale_target_const.i32x4
gv1 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv1
dt1 = i32x4*gv0
block0(v0: i32):
v1 = splat.dt1 v0
v2 = swiden_low v1
v3 = extract_vector v2, 0
return v3
}
; check: dup v2.4s, w0
; nextln: sxtl v0.2d, v2.2s
; nextln: ret

@@ -0,0 +1,129 @@
test compile precise-output
target aarch64
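
; Dynamic stack slots: materializing the scale, storing and loading
; through an explicit_dynamic_slot, and taking a slot's address.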
function %store_scale() {
gv0 = dyn_scale_target_const.i32x4
ss0 = explicit_slot 8
block0:
v0 = global_value.i64 gv0
stack_store.i64 v0, ss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x0, sp
; movz x2, #1
; str x2, [x0]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %store_scale_lt_128() {
gv0 = dyn_scale_target_const.i16x4
ss0 = explicit_slot 8
block0:
v0 = global_value.i64 gv0
stack_store.i64 v0, ss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x0, sp
; movz x2, #1
; str x2, [x0]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %store_explicit(i32) {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0(v0: i32):
v1 = splat.dt0 v0
dynamic_stack_store.dt0 v1, dss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; dup v2.4s, w0
; mov x4, sp
; str q2, [x4]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %load_explicit() -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0:
v0 = dynamic_stack_load.dt0 dss0
v1 = extract_vector.dt0 v0, 0
return v1
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x3, sp
; ldr q0, [x3]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %store_implicit(i32) {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0(v0: i32):
v1 = splat.dt0 v0
dynamic_stack_store v1, dss0
return
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; dup v2.4s, w0
; mov x4, sp
; str q2, [x4]
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret
function %addr() -> i64 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
dss0 = explicit_dynamic_slot dt0
block0:
v0 = dynamic_stack_addr.i64 dss0
return v0
}
; stp fp, lr, [sp, #-16]!
; mov fp, sp
; sub sp, sp, #16
; block0:
; mov x0, sp
; add sp, sp, #16
; ldp fp, lr, [sp], #16
; ret

@@ -0,0 +1,197 @@
test run
target aarch64
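
; Runtime tests: splat plus arithmetic on dynamic vector types
; produce the expected fixed-width lane values under a scale of 1.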
function %i8x16_splat_add(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i8x16_splat_add(1, 3) == [4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]
function %i16x8_splat_add(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i16x8_splat_add(255, 254) == [509 509 509 509 509 509 509 509]
function %i32x4_splat_add(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i32x4_splat_add(1234, 8765) == [9999 9999 9999 9999]
function %i64x2_splat_add(i64, i64) -> i64x2 {
gv0 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv0
block0(v0: i64, v1: i64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = iadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i64x2_splat_add(4321, 8765) == [13086 13086]
function %i8x16_splat_sub(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i8x16_splat_sub(127, 126) == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %i16x8_splat_sub(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i16x8_splat_sub(12345, 6789) == [5556 5556 5556 5556 5556 5556 5556 5556]
function %i32x4_splat_sub(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i32x4_splat_sub(1, 3) == [-2 -2 -2 -2]
function %i64x2_splat_sub(i64, i64) -> i64x2 {
gv0 = dyn_scale_target_const.i64x2
dt0 = i64x2*gv0
block0(v0: i64, v1: i64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = isub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i64x2_splat_sub(255, 65535) == [-65280 -65280]
function %i8x16_splat_mul(i8, i8) -> i8x16 {
gv0 = dyn_scale_target_const.i8x16
dt0 = i8x16*gv0
block0(v0: i8, v1: i8):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i8x16_splat_mul(15, 15) == [225 225 225 225 225 225 225 225 225 225 225 225 225 225 225 225]
function %i16x8_splat_mul(i16, i16) -> i16x8 {
gv0 = dyn_scale_target_const.i16x8
dt0 = i16x8*gv0
block0(v0: i16, v1: i16):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i16x8_splat_mul(135, 246) == [33210 33210 33210 33210 33210 33210 33210 33210]
function %i32x4_splat_mul(i32, i32) -> i32x4 {
gv0 = dyn_scale_target_const.i32x4
dt0 = i32x4*gv0
block0(v0: i32, v1: i32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = imul v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %i32x4_splat_mul(2, 3) == [6 6 6 6]
function %f32x4_splat_add(f32, f32) -> f32x4 {
gv0 = dyn_scale_target_const.f32x4
dt0 = f32x4*gv0
block0(v0: f32, v1: f32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f32x4_splat_add(0x1.2, 0x3.4) == [0x4.6 0x4.6 0x4.6 0x4.6]
function %f64x2_splat_add(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
dt0 = f64x2*gv0
block0(v0: f64, v1: f64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fadd v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f64x2_splat_add(0x1.0, 0x2.0) == [0x3.0 0x3.0]
function %f32x4_splat_sub(f32, f32) -> f32x4 {
gv0 = dyn_scale_target_const.f32x4
dt0 = f32x4*gv0
block0(v0: f32, v1: f32):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fsub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f32x4_splat_sub(0x1.2, 0x3.4) == [-0x2.2 -0x2.2 -0x2.2 -0x2.2]
function %f64x2_splat_sub(f64, f64) -> f64x2 {
gv0 = dyn_scale_target_const.f64x2
dt0 = f64x2*gv0
block0(v0: f64, v1: f64):
v2 = splat.dt0 v0
v3 = splat.dt0 v1
v4 = fsub v2, v3
v5 = extract_vector v4, 0
return v5
}
; run: %f64x2_splat_sub(0x1.0, 0x3.0) == [-0x2.0 -0x2.0]