From dd40bf075a943d74841f0276735b26762af890a6 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Mon, 25 Jul 2022 21:37:06 +0200 Subject: [PATCH] s390x: Enable more runtests, and fix a few bugs (#4516) This enables more runtests to be executed on s390x. Doing so uncovered a two back-end bugs, which are fixed as well: - The result of cls was always off by one. - The result of popcnt.i16 has uninitialized high bits. In addition, I found a bug in the load-op-store.clif test case: v3 = heap_addr.i64 heap0, v1, 4 v4 = iconst.i64 42 store.i32 v4, v3 This was clearly intended to perform a 32-bit store, but actually performs a 64-bit store (it seems the type annotation of the store opcode is ignored, and the type of the operand is used instead). That bug did not show any noticable symptoms on little-endian architectures, but broke on big-endian. --- cranelift/codegen/src/isa/s390x/lower.isle | 21 ++++++++++++++----- .../filetests/filetests/isa/s390x/bitops.clif | 14 ++++++------- .../filetests/filetests/runtests/bint.clif | 1 + .../filetests/filetests/runtests/cls.clif | 1 + .../filetests/filetests/runtests/clz.clif | 1 + .../filetests/runtests/conversions.clif | 1 + .../filetests/filetests/runtests/ctz.clif | 1 + .../filetests/filetests/runtests/iabs.clif | 3 ++- .../filetests/runtests/icmp-ugt.clif | 1 + .../filetests/filetests/runtests/ireduce.clif | 1 + .../filetests/runtests/load-op-store.clif | 3 ++- .../filetests/filetests/runtests/popcnt.clif | 1 + .../filetests/filetests/runtests/select.clif | 1 + .../runtests/simd-scalartovector-aarch64.clif | 3 ++- .../runtests/simd-scalartovector.clif | 1 + .../filetests/runtests/smulhi-aarch64.clif | 3 ++- .../filetests/filetests/runtests/smulhi.clif | 1 + 17 files changed, 42 insertions(+), 16 deletions(-) diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 0685d7e653..1468c60b2a 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -1149,17 +1149,26 @@ ;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The result of cls is not supposed to count the sign bit itself, just +;; additional copies of it. Therefore, when computing cls in terms of clz, +;; we need to subtract one. Fold this into the offset computation. +(decl cls_offset (Type Reg) Reg) +(rule (cls_offset $I8 x) (add_simm16 $I8 x -57)) +(rule (cls_offset $I16 x) (add_simm16 $I16 x -49)) +(rule (cls_offset $I32 x) (add_simm16 $I32 x -33)) +(rule (cls_offset $I64 x) (add_simm16 $I64 x -1)) + ;; Count leading sign-bit copies. We don't have any instruction for that, ;; so we instead count the leading zeros after inverting the input if negative, ;; i.e. computing -;; cls(x) == clz(x ^ (x >> 63)) +;; cls(x) == clz(x ^ (x >> 63)) - 1 ;; where x is the sign-extended input. (rule (lower (has_type (fits_in_64 ty) (cls x))) (let ((ext_reg Reg (put_in_reg_sext64 x)) (signbit_copies Reg (ashr_imm $I64 ext_reg 63)) (inv_reg Reg (xor_reg $I64 ext_reg signbit_copies)) (clz RegPair (clz_reg 64 inv_reg))) - (clz_offset ty (regpair_hi clz)))) + (cls_offset ty (regpair_hi clz)))) ;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1214,12 +1223,14 @@ ;; of each input byte separately, so we need to accumulate those partial ;; results via a series of log2(type size in bytes) - 1 additions. We ;; accumulate in the high byte, so that a final right shift will zero out -;; any unrelated bits to give a clean result. +;; any unrelated bits to give a clean result. (This does not work with +;; $I16, where we instead accumulate in the low byte and clear high bits +;; via an explicit and operation.) (rule (lower (has_type (and (mie2_disabled) $I16) (popcnt x))) (let ((cnt2 Reg (popcnt_byte x)) - (cnt1 Reg (add_reg $I32 cnt2 (lshl_imm $I32 cnt2 8)))) - (lshr_imm $I32 cnt1 8))) + (cnt1 Reg (add_reg $I32 cnt2 (lshr_imm $I32 cnt2 8)))) + (and_uimm16shifted $I32 cnt1 (uimm16shifted 255 0)))) (rule (lower (has_type (and (mie2_disabled) $I32) (popcnt x))) (let ((cnt4 Reg (popcnt_byte x)) diff --git a/cranelift/filetests/filetests/isa/s390x/bitops.clif b/cranelift/filetests/filetests/isa/s390x/bitops.clif index 444b5242e1..3aa4b12dec 100644 --- a/cranelift/filetests/filetests/isa/s390x/bitops.clif +++ b/cranelift/filetests/filetests/isa/s390x/bitops.clif @@ -93,7 +93,7 @@ block0(v0: i64): ; srag %r5, %r2, 63 ; xgrk %r3, %r2, %r5 ; flogr %r0, %r3 -; lgr %r2, %r0 +; aghik %r2, %r0, -1 ; br %r14 function %cls_i32(i32) -> i32 { @@ -107,7 +107,7 @@ block0(v0: i32): ; srag %r3, %r5, 63 ; xgr %r5, %r3 ; flogr %r0, %r5 -; ahik %r2, %r0, -32 +; ahik %r2, %r0, -33 ; br %r14 function %cls_i16(i16) -> i16 { @@ -121,7 +121,7 @@ block0(v0: i16): ; srag %r3, %r5, 63 ; xgr %r5, %r3 ; flogr %r0, %r5 -; ahik %r2, %r0, -48 +; ahik %r2, %r0, -49 ; br %r14 function %cls_i8(i8) -> i8 { @@ -135,7 +135,7 @@ block0(v0: i8): ; srag %r3, %r5, 63 ; xgr %r5, %r3 ; flogr %r0, %r5 -; ahik %r2, %r0, -56 +; ahik %r2, %r0, -57 ; br %r14 function %ctz_i64(i64) -> i64 { @@ -238,9 +238,9 @@ block0(v0: i16): ; block0: ; popcnt %r5, %r2 -; sllk %r3, %r5, 8 -; ar %r5, %r3 -; srlk %r2, %r5, 8 +; srlk %r3, %r5, 8 +; ark %r2, %r5, %r3 +; nill %r2, 255 ; br %r14 function %popcnt_i8(i8) -> i8 { diff --git a/cranelift/filetests/filetests/runtests/bint.clif b/cranelift/filetests/filetests/runtests/bint.clif index f107d12648..996b635312 100644 --- a/cranelift/filetests/filetests/runtests/bint.clif +++ b/cranelift/filetests/filetests/runtests/bint.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 function %bint_b1_i8_true() -> i8 { diff --git a/cranelift/filetests/filetests/runtests/cls.clif b/cranelift/filetests/filetests/runtests/cls.clif index 00411c33d2..fdd937bd35 100644 --- a/cranelift/filetests/filetests/runtests/cls.clif +++ b/cranelift/filetests/filetests/runtests/cls.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x ; not implemented on `x86_64` function %cls_i8(i8) -> i8 { diff --git a/cranelift/filetests/filetests/runtests/clz.clif b/cranelift/filetests/filetests/runtests/clz.clif index a6f7dde54b..dced407b74 100644 --- a/cranelift/filetests/filetests/runtests/clz.clif +++ b/cranelift/filetests/filetests/runtests/clz.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 target x86_64 has_lzcnt diff --git a/cranelift/filetests/filetests/runtests/conversions.clif b/cranelift/filetests/filetests/runtests/conversions.clif index 42becd8b00..38ba57c7ab 100644 --- a/cranelift/filetests/filetests/runtests/conversions.clif +++ b/cranelift/filetests/filetests/runtests/conversions.clif @@ -1,6 +1,7 @@ test run target x86_64 +target s390x target aarch64 function %fpromote_f32_f64(i64 vmctx, i64, f32) -> f64 { diff --git a/cranelift/filetests/filetests/runtests/ctz.clif b/cranelift/filetests/filetests/runtests/ctz.clif index a7776d1897..5f8f7023da 100644 --- a/cranelift/filetests/filetests/runtests/ctz.clif +++ b/cranelift/filetests/filetests/runtests/ctz.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 target x86_64 has_bmi1 diff --git a/cranelift/filetests/filetests/runtests/iabs.clif b/cranelift/filetests/filetests/runtests/iabs.clif index 8781744ece..f5552c30ec 100644 --- a/cranelift/filetests/filetests/runtests/iabs.clif +++ b/cranelift/filetests/filetests/runtests/iabs.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x ; x86_64 only supports vector iabs function %iabs_i8(i8) -> i8 { @@ -41,4 +42,4 @@ block0(v0: i64): ; run: %iabs_i64(0) == 0 ; run: %iabs_i64(9223372036854775807) == 9223372036854775807 ; run: %iabs_i64(-9223372036854775807) == 9223372036854775807 -; run: %iabs_i64(-9223372036854775808) == -9223372036854775808 \ No newline at end of file +; run: %iabs_i64(-9223372036854775808) == -9223372036854775808 diff --git a/cranelift/filetests/filetests/runtests/icmp-ugt.clif b/cranelift/filetests/filetests/runtests/icmp-ugt.clif index 5e99ec4ff2..76d67e0cba 100644 --- a/cranelift/filetests/filetests/runtests/icmp-ugt.clif +++ b/cranelift/filetests/filetests/runtests/icmp-ugt.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 function %icmp_ugt_i8(i8, i8) -> b1 { diff --git a/cranelift/filetests/filetests/runtests/ireduce.clif b/cranelift/filetests/filetests/runtests/ireduce.clif index 292bc14f2c..b103cbb5b8 100644 --- a/cranelift/filetests/filetests/runtests/ireduce.clif +++ b/cranelift/filetests/filetests/runtests/ireduce.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 function %ireduce_i16_i8(i16) -> i8 { diff --git a/cranelift/filetests/filetests/runtests/load-op-store.clif b/cranelift/filetests/filetests/runtests/load-op-store.clif index fd6a8b8291..ebf692b447 100644 --- a/cranelift/filetests/filetests/runtests/load-op-store.clif +++ b/cranelift/filetests/filetests/runtests/load-op-store.clif @@ -1,5 +1,6 @@ test run target x86_64 +target s390x target aarch64 function %load_op_store_iadd_i64(i64 vmctx, i64, i64) -> i64 { @@ -28,7 +29,7 @@ function %load_op_store_iadd_i32(i64 vmctx, i64, i32) -> i32 { block0(v0: i64, v1: i64, v2: i32): v3 = heap_addr.i64 heap0, v1, 4 - v4 = iconst.i64 42 + v4 = iconst.i32 42 store.i32 v4, v3 v5 = load.i32 v3 v6 = iadd.i32 v5, v2 diff --git a/cranelift/filetests/filetests/runtests/popcnt.clif b/cranelift/filetests/filetests/runtests/popcnt.clif index 1afcca329a..560031d4de 100644 --- a/cranelift/filetests/filetests/runtests/popcnt.clif +++ b/cranelift/filetests/filetests/runtests/popcnt.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x target x86_64 target x86_64 has_popcnt diff --git a/cranelift/filetests/filetests/runtests/select.clif b/cranelift/filetests/filetests/runtests/select.clif index 6356b5226c..0aa575af1f 100644 --- a/cranelift/filetests/filetests/runtests/select.clif +++ b/cranelift/filetests/filetests/runtests/select.clif @@ -1,5 +1,6 @@ test interpret test run +target s390x target x86_64 function %select_eq_f32(f32, f32) -> i32 { diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif index 5a458f7de6..a49b525260 100644 --- a/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif @@ -1,5 +1,6 @@ test run target aarch64 +target s390x ; i8 and i16 are invalid source sizes for x86_64 function %scalartovector_i8(i8) -> i8x16 { @@ -16,4 +17,4 @@ block0(v0: i16): return v1 } ; run: %scalartovector_i16(1) == [1 0 0 0 0 0 0 0] -; run: %scalartovector_i16(65535) == [65535 0 0 0 0 0 0 0] \ No newline at end of file +; run: %scalartovector_i16(65535) == [65535 0 0 0 0 0 0 0] diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif index 37d726f8bf..737f70866c 100644 --- a/cranelift/filetests/filetests/runtests/simd-scalartovector.clif +++ b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif @@ -1,5 +1,6 @@ test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41 diff --git a/cranelift/filetests/filetests/runtests/smulhi-aarch64.clif b/cranelift/filetests/filetests/runtests/smulhi-aarch64.clif index b123f2f0c9..031602552e 100644 --- a/cranelift/filetests/filetests/runtests/smulhi-aarch64.clif +++ b/cranelift/filetests/filetests/runtests/smulhi-aarch64.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x ; x86_64 backend only supports `i16`, `i32`, and `i64` types. function %smulhi_i8(i8, i8) -> i8 { @@ -10,4 +11,4 @@ block0(v0: i8, v1: i8): } ; run: %smulhi_i8(-2, -4) == 0 ; run: %smulhi_i8(2, -4) == -1 -; run: %smulhi_i8(127, 127) == 63 \ No newline at end of file +; run: %smulhi_i8(127, 127) == 63 diff --git a/cranelift/filetests/filetests/runtests/smulhi.clif b/cranelift/filetests/filetests/runtests/smulhi.clif index 306ee70913..979ee2588e 100644 --- a/cranelift/filetests/filetests/runtests/smulhi.clif +++ b/cranelift/filetests/filetests/runtests/smulhi.clif @@ -1,6 +1,7 @@ test interpret test run target aarch64 +target s390x set enable_simd target x86_64 has_sse3 has_ssse3 has_sse41