Add a MemFlags operand to the bitcast instruction, where only the `big` and `little` flags are accepted. These define the lane order to be used when casting between types of different lane counts. Update all users to pass an appropriate MemFlags argument. Implement lane swaps where necessary in the s390x back-end. This is the final part necessary to fix https://github.com/bytecodealliance/wasmtime/issues/4566.
212 lines
5.5 KiB
Plaintext
212 lines
5.5 KiB
Plaintext
; Run-tests for SIMD lane access: shuffle, swizzle, insertlane, extractlane,
; splat and the narrowing conversions.
test run
target aarch64
target s390x
; `enable_simd` is set immediately ahead of the x86_64 target line, which also
; requests the SSE3, SSSE3 and SSE4.1 features.
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
|
|
|
|
;; shuffle
|
|
|
|
; Shuffle drawing from two distinct operands.
function %shuffle_different_ssa_values() -> i8x16 {
block0:
    ; First operand is all zeroes; second operand carries 42 in its last lane.
    v10 = vconst.i8x16 0x00
    v11 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
    ; Every output lane takes lane 0 of v10, except the final one, whose
    ; index 31 selects the last lane of v11.
    v12 = shuffle v10, v11, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31]
    return v12
}
; run: %shuffle_different_ssa_values() == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
|
|
|
|
; Shuffle where both operands are the same SSA value.
function %shuffle_same_ssa_value() -> i8x16 {
block0:
    ; In hexadecimal syntax lane 15 is the leftmost byte, so only lane 15 is 1.
    v10 = vconst.i8x16 0x01000000_00000000_00000000_00000000
    ; Index 0x0f (15) in every position: broadcast the last lane of v10.
    v11 = shuffle v10, v10, 0x0f0f0f0f_0f0f0f0f_0f0f0f0f_0f0f0f0f
    return v11
}
; run: %shuffle_same_ssa_value() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
|
|
|
|
; Identity shuffle of an i32x4 value, round-tripped through i8x16.
function %shuffle_i32x4_in_same_place() -> i32x4 {
block0:
    v1 = vconst.i32x4 [0 1 2 3]
    ; shuffle is type-limited to Tx16, so view the vector as bytes first; the
    ; `little` flag fixes the lane order used when the lane count changes.
    v2 = bitcast.i8x16 little v1
    ; keep each lane in place from the first vector
    v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    ; cast back with the same lane order so the round trip is an identity
    v4 = bitcast.i32x4 little v3
    return v4
}
; The directive must name the function defined above; it previously referred to
; the non-existent `%shuffle_in_same_place`.
; run: %shuffle_i32x4_in_same_place() == [0 1 2 3]
|
|
|
|
; Duplicate the all-ones i32 lanes over the zero lanes via a byte shuffle.
function %shuffle_i32x4_to_all_true() -> i32x4 {
block0:
    v10 = vconst.i32x4 [-1 0 -1 0]
    ; shuffle is type-limited to Tx16, hence the cast to bytes
    v11 = bitcast.i8x16 little v10
    ; bytes 0..3 (lane 0) are copied over lane 1 and bytes 8..11 (lane 2) over
    ; lane 3, pairing up the true values so the entire vector becomes true
    v12 = shuffle v11, v11, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
    ; TODO store.i32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
    v13 = bitcast.i32x4 little v12
    return v13
}
; run: %shuffle_i32x4_to_all_true() == [0xffffffff 0xffffffff 0xffffffff 0xffffffff]
|
|
|
|
|
|
|
|
;; swizzle
|
|
|
|
; Thin wrapper around `swizzle`: the first argument supplies the lanes and the
; second supplies the per-lane selection indices.
function %swizzle(i8x16, i8x16) -> i8x16 {
block0(v10: i8x16, v11: i8x16):
    v12 = swizzle.i8x16 v10, v11
    return v12
}
|
|
; reverse the lanes; the out-of-range index 42 yields a 0 lane (which here happens to equal the value in lane 0)
|
|
; run: %swizzle([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 42]) == [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
|
|
; 250 should overflow but saturate so that the MSB is set (PSHUFB zeroes any lane whose index has the MSB set); index 16 is likewise out of range and yields 0
|
|
; run: %swizzle([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], [16 250 0 0 0 0 0 0 0 0 0 0 0 0 0 0]) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
|
|
|
|
|
|
|
|
;; insertlane
|
|
|
|
; Overwrite lane 0 of an i8x16 vector with a scalar.
function %insertlane_i8x16_first(i8x16, i8) -> i8x16 {
block0(v10: i8x16, v11: i8):
    v12 = insertlane v10, v11, 0
    return v12
}
; run: %insertlane_i8x16_first([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], 0xff) == [0xff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
|
|
|
|
; Overwrite lane 1 of an f32x4 vector with a scalar.
function %insertlane_f32x4_second(f32x4, f32) -> f32x4 {
block0(v10: f32x4, v11: f32):
    v12 = insertlane v10, v11, 1
    return v12
}
; run: %insertlane_f32x4_second([0.0 0.0 0.0 0.0], 0x42.42) == [0.0 0x42.42 0.0 0.0]
|
|
|
|
; Overwrite lane 0 of an f64x2 vector with a scalar.
function %insertlane_f64x2_first(f64x2, f64) -> f64x2 {
block0(v10: f64x2, v11: f64):
    v12 = insertlane v10, v11, 0
    return v12
}
; run: %insertlane_f64x2_first([0.0 0.0], 0x42.42) == [0x42.42 0.0]
|
|
|
|
; Overwrite lane 1 of an f64x2 vector with a scalar.
function %insertlane_f64x2_second(f64x2, f64) -> f64x2 {
block0(v10: f64x2, v11: f64):
    v12 = insertlane v10, v11, 1
    return v12
}
; run: %insertlane_f64x2_second([0.0 0.0], 0x42.42) == [0.0 0x42.42]
|
|
|
|
|
|
|
|
;; extractlane
|
|
|
|
; Extract lane 10 (the only non-zero lane) from a constant i8x16 vector.
function %extractlane_i8x16() -> i8 {
block0:
    ; lane 10 holds -1 (0xff); every other lane is 0
    v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0]
    v2 = extractlane v1, 10
    ; same-type scalar bitcast, so no lane-order flag is given
    v3 = bitcast.i8 v2
    return v3
}
; The directive must name the function defined above; it previously referred to
; the non-existent `%extractlane_i8x16_last`.
; run: %extractlane_i8x16() == 0xff
|
|
|
|
; Read lane 1 out of an i16x8 vector.
function %extractlane_i16x8_second(i16x8) -> i16 {
block0(v10: i16x8):
    v11 = extractlane v10, 1
    return v11
}
; In hexadecimal syntax lane 0 is the rightmost 16 bits (0x0001), so lane 1 is 0x0002.
; run: %extractlane_i16x8_second(0x00080007000600050004000300020001) == 2
|
|
|
|
; Read lane 3 (the last lane) out of an f32x4 vector.
function %extractlane_f32x4_last(f32x4) -> f32 {
block0(v10: f32x4):
    v11 = extractlane v10, 3
    return v11
}
; run: %extractlane_f32x4_last([0x00.00 0x00.00 0x00.00 0x42.42]) == 0x42.42
|
|
|
|
; Insert into one lane of an i32x4 vector, then extract two different lanes
; from the same result and verify both.
function %extractlane_i32_with_vector_reuse() -> i8 {
block0:
    v10 = iconst.i32 42
    v11 = iconst.i32 99

    ; start from [42 42 42 42] and replace lane 2 with 99
    v12 = vconst.i32x4 [42 42 42 42]
    v13 = insertlane v12, v11, 2

    ; lane 3 was not touched and must still be 42
    v14 = extractlane v13, 3
    v15 = icmp eq v14, v10

    ; lane 2 must now hold the inserted 99
    v16 = extractlane v13, 2
    v17 = icmp eq v16, v11

    ; combine both comparison results into the returned flag
    v18 = band v15, v17
    return v18
}
; run
|
|
|
|
; Insert into one lane of an f32x4 vector, then extract two different lanes
; from the same result and verify both.
function %extractlane_f32_with_vector_reuse() -> i8 {
block0:
    v10 = f32const 0x42.42
    v11 = f32const 0x99.99

    ; start from four copies of 0x42.42 and replace lane 2 with 0x99.99
    v12 = vconst.f32x4 [0x42.42 0x42.42 0x42.42 0x42.42]
    v13 = insertlane v12, v11, 2

    ; lane 3 was not touched and must still be 0x42.42
    v14 = extractlane v13, 3
    v15 = fcmp eq v14, v10

    ; lane 2 must now hold the inserted 0x99.99
    v16 = extractlane v13, 2
    v17 = fcmp eq v16, v11

    ; combine both comparison results into the returned flag
    v18 = band v15, v17
    return v18
}
; run
|
|
|
|
|
|
|
|
;; splat
|
|
|
|
; Splat a scalar -1 into i64x2 and compare it against the equivalent constant.
function %splat_i64x2() -> i8 {
block0:
    v10 = iconst.i64 -1
    v11 = splat.i64x2 v10
    v12 = vconst.i64x2 [-1 -1]
    ; lane-wise equality, then reduce: the result is true only if every lane matched
    v13 = icmp eq v11, v12
    v14 = vall_true v13
    return v14
}
; run
|
|
|
|
; Broadcast an i8 scalar to all sixteen lanes.
function %splat_i8(i8) -> i8x16 {
block0(v10: i8):
    v11 = splat.i8x16 v10
    return v11
}
; run: %splat_i8(0xff) == [0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff]
|
|
|
|
; Broadcast an i32 scalar to all four lanes.
function %splat_i32(i32) -> i32x4 {
block0(v10: i32):
    v11 = splat.i32x4 v10
    return v11
}
; run: %splat_i32(42) == [42 42 42 42]
|
|
|
|
; Broadcast an f64 scalar to both lanes.
function %splat_f64(f64) -> f64x2 {
block0(v10: f64):
    v11 = splat.f64x2 v10
    return v11
}
; run: %splat_f64(-0x1.1) == [-0x1.1 -0x1.1]
|
|
|
|
|
|
;; narrow
|
|
|
|
; Narrow two i32x4 vectors into one i16x8 with signed saturation; the lanes of
; the first argument come first in the result.
function %snarrow(i32x4, i32x4) -> i16x8 {
block0(v10: i32x4, v11: i32x4):
    v12 = snarrow v10, v11
    return v12
}
; 0x0001ffff exceeds the i16 maximum and clamps to 0x7fff; 0xffffffff is -1 and stays 0xffff.
; run: %snarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 -1 0x7fff 4 5 -6 0xffff]
|
|
|
|
; Narrow two i32x4 vectors into one i16x8 with unsigned saturation; the lanes
; of the first argument come first in the result.
function %unarrow(i32x4, i32x4) -> i16x8 {
block0(v10: i32x4, v11: i32x4):
    v12 = unarrow v10, v11
    return v12
}
; Negative inputs (-1, -6, 0xffffffff) clamp to 0; 0x0001ffff clamps to 0xffff.
; run: %unarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 0 0xffff 4 5 0 0]
|