diff --git a/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif b/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif new file mode 100644 index 0000000000..c59c58188a --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif @@ -0,0 +1,280 @@ +test run +set enable_simd +target x86_64 skylake +feature "experimental_x64" + +function %iadd_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0:i32x4, v1:i32x4): + v2 = iadd v0, v1 + return v2 +} +; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5] + +function %iadd_i8x16_with_overflow() -> i8x16 { +block0: + v0 = vconst.i8x16 [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255] + v1 = vconst.i8x16 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2] + v2 = iadd v0, v1 + return v2 +} +; run: %iadd_i8x16_with_overflow() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + +function %isub_i32x4_rex() -> b1 { +block0: + v0 = vconst.i32x4 [1 1 1 1] + v1 = vconst.i32x4 [1 2 3 4] + v2 = isub v0, v1 + + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 0 + + v5 = extractlane v2, 1 + v6 = icmp_imm eq v5, 0xffffffff + ; TODO replace extractlanes with vector comparison + + v7 = band v4, v6 + return v7 +} +; run + + +function %ineg_i32x4() -> b1 { +block0: + v0 = vconst.i32x4 [1 1 1 1] + v2 = ineg v0 + + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, -1 + + return v4 +} +; run + +function %imul_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imul v0, v1 + return v2 +} +; run: %imul_i64x2([0 2], [0 2]) == [0 4] + +function %imul_i32x4() -> b1 { +block0: + v0 = vconst.i32x4 [-1 0 1 0x80_00_00_01] + v1 = vconst.i32x4 [2 2 2 2] + v2 = imul v0, v1 + + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, -2 + + v5 = extractlane v2, 1 + v6 = icmp_imm eq v5, 0 + + v7 = extractlane v2, 3 + v8 = icmp_imm eq v7, 2 ; 0x80_00_00_01 * 2 == 0x1_00_00_00_02 (and the 1 is dropped) + + v9 = band v4, v6 + v10 = band v8, v9 + return v10 +} +; run + +function %imul_i16x8() -> b1 { +block0: + v0 = vconst.i16x8 [-1 0 1 0x7f_ff 0 0 0 0] + v1 = vconst.i16x8 [2 2 2 2 0 0 0 0] + v2 = imul v0, v1 + + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 0xfffe ; 0xfffe == -2; -2 will not work here and below because v3 is + ; being uextend-ed, not sextend-ed + + v5 = extractlane v2, 1 + v6 = icmp_imm eq v5, 0 + + v7 = extractlane v2, 3 + v8 = icmp_imm eq v7, 0xfffe ; 0x7f_ff * 2 == 0xff_fe + + v9 = band v4, v6 + v10 = band v8, v9 + + return v4 +} +; run + +function %sadd_sat_i8x16() -> b1 { +block0: + v0 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + + v2 = sadd_sat v0, v1 + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 127 + + return v4 +} +; run + +function %uadd_sat_i16x8() -> b1 { +block0: + v0 = vconst.i16x8 [-1 0 0 0 0 0 0 0] + v1 = vconst.i16x8 [-1 1 1 1 1 1 1 1] + + v2 = uadd_sat v0, v1 + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 65535 + + return v4 +} +; run + +;function %sub_sat_i8x16() -> b1 { +;block0: +; v0 = vconst.i8x16 [128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ; 128 == 0x80 == -128 +; v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] +; +; v2 = ssub_sat v0, v1 +; v3 = extractlane v2, 0 +; v4 = icmp_imm eq v3, 0x80 ; 0x80 == -128 +; +; ; now re-use 0x80 as an unsigned 128 +; v5 = usub_sat v0, v2 +; v6 = extractlane v5, 0 +; v7 = icmp_imm eq v6, 0 +; +; v8 = band v4, v7 +; return v8 +;} +; _run + +;function %add_sub_f32x4() -> b1 { +;block0: +; v0 = vconst.f32x4 [0x4.2 0.0 0.0 0.0] +; v1 = vconst.f32x4 [0x1.0 0x1.0 0x1.0 0x1.0] +; v2 = vconst.f32x4 [0x5.2 0x1.0 0x1.0 0x1.0] +; +; v3 = fadd v0, v1 +; v4 = fcmp eq v3, v2 +; +; v6 = fsub v2, v1 +; v7 = fcmp eq v6, v0 +; +; v8 = band v4, v7 +; v9 = vall_true v8 +; return v9 +;} +; _run + +;function %mul_div_f32x4() -> b1 { +;block0: +; v0 = vconst.f32x4 [0x4.2 -0x2.1 0x2.0 0.0] +; v1 = vconst.f32x4 [0x3.4 0x6.7 0x8.9 0xa.b] +; v2 = vconst.f32x4 [0xd.68 -0xd.47 0x11.2 0x0.0] +; +; v3 = fmul v0, v1 +; v4 = fcmp eq v3, v2 +; +; v6 = fdiv v2, v1 +; v7 = fcmp eq v6, v0 +; +; v8 = band v4, v7 +; v9 = vall_true v8 +; return v9 +;} +; _run + +;function %sqrt_f64x2() -> b1 { +;block0: +; v0 = vconst.f64x2 [0x9.0 0x1.0] +; v1 = sqrt v0 +; v2 = vconst.f64x2 [0x3.0 0x1.0] +; v3 = fcmp eq v2, v1 +; v4 = vall_true v3 +; return v4 +;} +; _run + +function %fmax_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmax v0, v1 + return v2 +} +; note below how NaNs are quieted but (unlike fmin), retain their sign: this discrepancy is allowed by non-determinism +; in the spec, see https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0. +; run: %fmax_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [+0x0.0 0x1.0] +; run: %fmax_f64x2([-NaN NaN], [0x0.0 0x100.0]) == [-NaN NaN] +; run: %fmax_f64x2([NaN 0.0], [0.0 0.0]) == [NaN 0.0] +; run: %fmax_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0] +; run: %fmax_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [NaN 0.0] + +function %fmin_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fmin v0, v1 + return v2 +} +; note below how NaNs are quieted and negative: this is due to non-determinism in the spec for NaNs, see +; https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0. +; run: %fmin_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [-0x0.0 -0x1.0] +; run: %fmin_f64x2([-NaN 0x100.0], [0.0 NaN]) == [-NaN -NaN] +; run: %fmin_f64x2([NaN 0.0], [0.0 0.0]) == [-NaN 0.0] +; run: %fmin_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0] +; run: %fmin_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [-NaN 0.0] + +;function %fneg_f64x2() -> b1 { +;block0: +; v0 = vconst.f64x2 [0x1.0 -0x1.0] +; v1 = fneg v0 +; +; v2 = vconst.f64x2 [-0x1.0 0x1.0] +; v3 = fcmp eq v1, v2 +; v4 = vall_true v3 +; +; return v4 +;} +; _run + +;function %fneg_f32x4() -> b1 { +;block0: +; v0 = vconst.f32x4 [0x0.0 -0x0.0 -Inf Inf] +; v1 = fneg v0 +; +; v2 = vconst.f32x4 [-0x0.0 0x0.0 Inf -Inf] +; v3 = fcmp eq v1, v2 +; v4 = vall_true v3 +; +; return v4 +;} +; _run + +;function %fabs_f32x4() -> b1 { +;block0: +; v0 = vconst.f32x4 [0x0.0 -0x1.0 0x2.0 -0x3.0] +; v1 = fabs v0 +; +; v2 = vconst.f32x4 [0x0.0 0x1.0 0x2.0 0x3.0] +; v3 = fcmp eq v1, v2 +; v4 = vall_true v3 +; +; return v4 +;} +; _run + +;function %average_rounding_i16x8() -> b1 { +;block0: +; v0 = vconst.i16x8 [0 0 0 1 42 19 -1 0xffff] +; v1 = vconst.i16x8 [0 1 2 4 42 18 -1 0] +; v2 = vconst.i16x8 [0 1 1 3 42 19 -1 0x8000] +; +; v3 = avg_round v0, v1 +; v4 = icmp eq v2, v3 +; v5 = vall_true v4 +; +; return v5 +;} +; _run + +function %iabs(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iabs v0 + return v1 +} +; run: %iabs([-42 -1 0 1]) == [42 1 0 1]