Support big- and little-endian lane order with bitcast (#5196)

Add a MemFlags operand to the bitcast instruction, where only the `big` and `little` flags are accepted. These define the lane order to be used when casting between types of different lane counts. Update all users to pass an appropriate MemFlags argument. Implement lane swaps where necessary in the s390x back-end. This is the final part necessary to fix https://github.com/bytecodealliance/wasmtime/issues/4566.
2022-11-07 23:41:10 +01:00
parent 5cef53537b
commit 3e5938e65a
16 changed files with 295 additions and 51 deletions
--- a/cranelift/filetests/filetests/isa/s390x/bitcast.clif
+++ b/cranelift/filetests/filetests/isa/s390x/bitcast.clif
@@ -0,0 +1,79 @@
+test compile precise-output
+target s390x
+
+;; Bitcast between same-sized integer or reference types needs no conversion code (i128 is only copied via vl/vst).
+
+function %bitcast_i8_i8(i8) -> i8 {
+block0(v0: i8):
+  v1 = bitcast.i8 v0  ;; i8 -> i8, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i16_i16(i16) -> i16 {
+block0(v0: i16):
+  v1 = bitcast.i16 v0  ;; i16 -> i16, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i32_i32(i32) -> i32 {
+block0(v0: i32):
+  v1 = bitcast.i32 v0  ;; i32 -> i32, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64_i64(i64) -> i64 {
+block0(v0: i64):
+  v1 = bitcast.i64 v0  ;; i64 -> i64, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i128_i128(i128) -> i128 {
+block0(v0: i128):
+  v1 = bitcast.i128 v0  ;; golden output below is a vl from 0(%r3) followed by a vst to 0(%r2), then `br %r14`
+  return v1  ;; NOTE(review): presumably i128 is passed and returned through memory here -- confirm against the s390x ABI
+}
+
+; block0:
+;   vl %v0, 0(%r3)
+;   vst %v0, 0(%r2)
+;   br %r14
+
+function %bitcast_r64_i64(r64) -> i64 {
+block0(v0: r64):
+  v1 = bitcast.i64 v0  ;; reference (r64) -> i64, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64_r64(i64) -> r64 {
+block0(v0: i64):
+  v1 = bitcast.r64 v0  ;; i64 -> reference (r64), no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_r64_r64(r64) -> r64 {
+block0(v0: r64):
+  v1 = bitcast.r64 v0  ;; r64 -> r64, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
--- a/cranelift/filetests/filetests/isa/s390x/floating-point.clif
+++ b/cranelift/filetests/filetests/isa/s390x/floating-point.clif
@@ -1200,3 +1200,21 @@ block0(v0: f32):
 ;   vlgvf %r2, %v0, 0
 ;   br %r14

+function %bitcast_f32_f32(f32) -> f32 {
+block0(v0: f32):
+  v1 = bitcast.f32 v0  ;; f32 -> f32, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_f64_f64(f64) -> f64 {
+block0(v0: f64):
+  v1 = bitcast.f64 v0  ;; f64 -> f64, no byte-order flag given; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
--- a/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif
+++ b/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif
@@ -0,0 +1,76 @@
+test compile precise-output
+target s390x
+
+;; Vector bitcast is a no-op if the lane count remains unchanged,
+;; or if the ABI lane-order matches the specified byte order.
+;; Otherwise, lane-swaps need to happen.
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {  ;; case: no calling convention in the signature, `big`
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 big v0  ;; lane count changes 2 -> 4; golden output below is only `br %r14` (no swap code)
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {  ;; case: no calling convention in the signature, `little`
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 little v0  ;; lane count changes 2 -> 4; golden output below expects a vpdi/vpdi/verllg sequence
+  return v1
+}
+
+; block0:
+;   vpdi %v3, %v24, %v24, 4
+;   vpdi %v5, %v3, %v3, 4
+;   verllg %v24, %v5, 32
+;   br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {  ;; case: wasmtime_system_v, `big`
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 big v0  ;; lane count changes 2 -> 4; under this calling convention `big` expects the vpdi/vpdi/verllg sequence
+  return v1
+}
+
+; block0:
+;   vpdi %v3, %v24, %v24, 4
+;   vpdi %v5, %v3, %v3, 4
+;   verllg %v24, %v5, 32
+;   br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {  ;; case: wasmtime_system_v, `little`
+block0(v0: i64x2):
+  v1 = bitcast.i32x4 little v0  ;; lane count changes 2 -> 4; under this calling convention `little` expects only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {  ;; case: no calling convention in the signature, `big`
+block0(v0: i64x2):
+  v1 = bitcast.f64x2 big v0  ;; lane count stays 2 -> 2; golden output below is only `br %r14`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {  ;; case: no calling convention in the signature, `little`
+block0(v0: i64x2):
+  v1 = bitcast.f64x2 little v0  ;; lane count stays 2 -> 2; golden output below is only `br %r14` even with `little`
+  return v1
+}
+
+; block0:
+;   br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 wasmtime_system_v {  ;; case: wasmtime_system_v, `big`
+block0(v0: i64x2):
+  v1 = bitcast.f64x2 big v0  ;; lane count stays 2 -> 2; golden output below is only `br %r14` under this calling convention too
+  return v1
+}
+
+; block0:
+;   br %r14
+
--- a/cranelift/filetests/filetests/isa/x64/move-elision.clif
+++ b/cranelift/filetests/filetests/isa/x64/move-elision.clif
@@ -7,9 +7,9 @@ block0(v0: i32x4):
    ;; In the x64 backend, all of these pseudo-instructions are lowered to moves between registers (e.g. MOVAPD, MOVDQA,
    ;; etc.). Because these have been marked as moves, no instructions are emitted by this function besides the prologue
    ;; and epilogue.
-    v1 = bitcast.f32x4 v0
-    v2 = bitcast.f64x2 v1
-    v3 = bitcast.i8x16 v2
+    v1 = bitcast.f32x4 little v0
+    v2 = bitcast.f64x2 little v1
+    v3 = bitcast.i8x16 little v2
    return v3
 }

--- a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
@@ -43,7 +43,7 @@ block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4):

 function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
 block0(v0: i8x16, v1: i8x16, v2: i32x4):
-    v3 = bitcast.i8x16 v2
+    v3 = bitcast.i8x16 little v2
    v4 = bitselect v3, v0, v1
    return v4
 }
--- a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif
@@ -16,7 +16,7 @@ block0(v0: i32x4, v1: i32x4):

 function %mask_casted(i64x2, i64x2, i32x4) -> i64x2 {
 block0(v0: i64x2, v1: i64x2, v2: i32x4):
-    v3 = bitcast.i64x2 v2
+    v3 = bitcast.i64x2 little v2
    v4 = bitselect v3, v0, v1
    return v4
 }
--- a/cranelift/filetests/filetests/runtests/simd-lane-access.clif
+++ b/cranelift/filetests/filetests/runtests/simd-lane-access.clif
@@ -26,10 +26,10 @@ block0:
 function %shuffle_i32x4_in_same_place() -> i32x4 {
 block0:
    v1 = vconst.i32x4 [0 1 2 3]
-    v2 = bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
+    v2 = bitcast.i8x16 little v1 ; we have to cast because shuffle is type-limited to Tx16
    ; keep each lane in place from the first vector
    v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
-    v4 = bitcast.i32x4 v3
+    v4 = bitcast.i32x4 little v3
    return v4
 }
 ; run: %shuffle_in_same_place() == [0 1 2 3]
@@ -37,10 +37,10 @@ block0:
 function %shuffle_i32x4_to_all_true() -> i32x4 {
 block0:
    v1 = vconst.i32x4 [-1 0 -1 0]
-    v2 = bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
+    v2 = bitcast.i8x16 little v1 ; we have to cast because shuffle is type-limited to Tx16
    ; pair up the true values to make the entire vector true
    v3 = shuffle v2, v2, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
-    v4 = bitcast.i32x4 v3 ; TODO store.i32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
+    v4 = bitcast.i32x4 little v3 ; TODO store.i32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
    return v4
 }
 ; run: %shuffle_i32x4_to_all_true() == [0xffffffff 0xffffffff 0xffffffff 0xffffffff]
--- a/cranelift/filetests/filetests/verifier/bitcast.clif
+++ b/cranelift/filetests/filetests/verifier/bitcast.clif
@@ -21,3 +21,34 @@ block0(v0: i64):
    return v1
 }

+; "little"/"big" flag modifier is ok
+function %bitcast_little(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 little v0 ;; `little` is one of the two accepted flags; no `error:` annotation is attached to this line
+    return v1
+}
+function %bitcast_big(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 big v0 ;; `big` is one of the two accepted flags; no `error:` annotation is attached to this line
+    return v1
+}
+
+; other flag modifiers are not ok
+function %bitcast_notrap(i32) -> f32 { ;; negative case: `notrap` is a memory flag other than `big`/`little`
+block0(v0: i32):
+    v1 = bitcast.f32 notrap v0 ;  error: The bitcast instruction only accepts the `big` or `little` memory flags
+    return v1
+}
+function %bitcast_aligned(i32) -> f32 { ;; negative case: `aligned` is a memory flag other than `big`/`little`
+block0(v0: i32):
+    v1 = bitcast.f32 aligned v0 ;  error: The bitcast instruction only accepts the `big` or `little` memory flags
+    return v1
+}
+
+; if lane counts differ, a byte order specifier is required
+function %bitcast_lanes(i32x4) -> i64x2 { ;; negative case: lane count changes 4 -> 2 and neither `big` nor `little` is given
+block0(v0: i32x4):
+    v1 = bitcast.i64x2 v0 ;  error: Byte order specifier required for bitcast instruction changing lane count
+    return v1
+}
+