Support big- and little-endian lane order with bitcast (#5196)

Add a MemFlags operand to the bitcast instruction, where only the
`big` and `little` flags are accepted.  These define the lane order
to be used when casting between types of different lane counts.

Update all users to pass an appropriate MemFlags argument.

Implement lane swaps where necessary in the s390x back-end.

This is the final part necessary to fix
https://github.com/bytecodealliance/wasmtime/issues/4566.
Author: Ulrich Weigand
Date: 2022-11-07 23:41:10 +01:00 (committed via GitHub)
Commit: 3e5938e65a (parent 5cef53537b)
16 changed files with 295 additions and 51 deletions
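For orientation (illustrative only; the function name below is made up and not taken from the diff), the flagged form of `bitcast` introduced here looks like this in CLIF. The `little`/`big` specifier is what the new MemFlags operand carries, and it is required whenever the cast changes the lane count:

    function %illustrate(i32x4) -> i64x2 {
    block0(v0: i32x4):
        v1 = bitcast.i64x2 little v0  ; lane count changes, so `little` or `big` must be given
        return v1
    }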

View File

@@ -3104,6 +3104,7 @@ pub(crate) fn define(
     let x = &Operand::new("x", Mem);
     let a = &Operand::new("a", MemTo).with_doc("Bits of `x` reinterpreted");
+    let MemFlags = &Operand::new("MemFlags", &imm.memflags);
 
     ig.push(
         Inst::new(
@@ -3113,11 +3114,16 @@ pub(crate) fn define(
         The input and output types must be storable to memory and of the same
         size. A bitcast is equivalent to storing one type and loading the other
-        type from the same address.
+        type from the same address, both using the specified MemFlags.
+
+        Note that this operation only supports the `big` or `little` MemFlags.
+        The specified byte order only affects the result in the case where
+        input and output types differ in lane count/size. In this case, the
+        operation is only valid if a byte order specifier is provided.
         "#,
-            &formats.unary,
+            &formats.load_no_offset,
         )
-        .operands_in(vec![x])
+        .operands_in(vec![MemFlags, x])
         .operands_out(vec![a]),
     );
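The documentation above defines a flagged bitcast as storing one type and reloading the other with the same MemFlags. A minimal CLIF sketch of that equivalence (illustrative only; `%bitcast_via_memory` and its pointer argument are hypothetical and not part of this commit):

    function %bitcast_via_memory(i64, i32x4) -> i64x2 {
    block0(v0: i64, v1: i32x4):
        store little v1, v0         ; store the input type at address v0
        v2 = load.i64x2 little v0   ; reload the same bytes as the output type
        ; per the text above, v2 is what `bitcast.i64x2 little v1` produces
        return v2
    }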

View File

@@ -2197,25 +2197,25 @@
 ;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ; SIMD&FP <=> SIMD&FP
-(rule 5 (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type (ty_float_or_vec _)))))
+(rule 5 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type (ty_float_or_vec _)))))
       x)
 
 ; GPR => SIMD&FP
-(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type in_ty))))
+(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type in_ty))))
       (if (ty_int_ref_scalar_64 in_ty))
       (mov_to_fpu x (scalar_size in_ty)))
 
 ; SIMD&FP => GPR
-(rule 3 (lower (has_type out_ty (bitcast x @ (value_type (fits_in_64 (ty_float_or_vec _))))))
+(rule 3 (lower (has_type out_ty (bitcast _ x @ (value_type (fits_in_64 (ty_float_or_vec _))))))
       (if (ty_int_ref_scalar_64 out_ty))
       (mov_from_vec x 0 (scalar_size out_ty)))
 
 ; GPR <=> GPR
-(rule 2 (lower (has_type out_ty (bitcast x @ (value_type in_ty))))
+(rule 2 (lower (has_type out_ty (bitcast _ x @ (value_type in_ty))))
       (if (ty_int_ref_scalar_64 out_ty))
       (if (ty_int_ref_scalar_64 in_ty))
       x)
-(rule 1 (lower (has_type $I128 (bitcast x @ (value_type $I128)))) x)
+(rule 1 (lower (has_type $I128 (bitcast _ x @ (value_type $I128)))) x)
 
 ;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -811,7 +811,7 @@
 )
 
 ;;;;; Rules for `bitcast`;;;;;;;;;
 (rule
-  (lower (has_type out (bitcast v @ (value_type in_ty))))
+  (lower (has_type out (bitcast _ v @ (value_type in_ty))))
   (gen_moves v in_ty out))
 ;;;;; Rules for `ceil`;;;;;;;;;

View File

@@ -1486,6 +1486,11 @@
 (rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.LittleEndian)) $false)
 (rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.BigEndian)) $true)
 
+;; Return lane order matching memory byte order.
+(decl pure lane_order_from_memflags (MemFlags) LaneOrder)
+(rule 0 (lane_order_from_memflags (littleendian)) (LaneOrder.LittleEndian))
+(rule 1 (lane_order_from_memflags (bigendian)) (LaneOrder.BigEndian))
+
 ;; Convert a CLIF immediate lane index value to big-endian lane order.
 (decl be_lane_idx (Type u8) u8)
 (extern constructor be_lane_idx be_lane_idx)

View File

@@ -1738,40 +1738,46 @@
 ;;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Reinterpret a 64-bit integer value as floating-point.
-(rule (lower (has_type $F64 (bitcast x @ (value_type $I64))))
+(rule (lower (has_type $F64 (bitcast _ x @ (value_type $I64))))
       (vec_insert_lane_undef $F64X2 x 0 (zero_reg)))
 
 ;; Reinterpret a 64-bit floating-point value as integer.
-(rule (lower (has_type $I64 (bitcast x @ (value_type $F64))))
+(rule (lower (has_type $I64 (bitcast _ x @ (value_type $F64))))
       (vec_extract_lane $F64X2 x 0 (zero_reg)))
 
 ;; Reinterpret a 32-bit integer value as floating-point.
-(rule (lower (has_type $F32 (bitcast x @ (value_type $I32))))
+(rule (lower (has_type $F32 (bitcast _ x @ (value_type $I32))))
      (vec_insert_lane_undef $F32X4 x 0 (zero_reg)))
 
 ;; Reinterpret a 32-bit floating-point value as integer.
-(rule (lower (has_type $I32 (bitcast x @ (value_type $F32))))
+(rule (lower (has_type $I32 (bitcast _ x @ (value_type $F32))))
      (vec_extract_lane $F32X4 x 0 (zero_reg)))
 
 ;; Bitcast between types residing in GPRs is a no-op.
 (rule 1 (lower (has_type (gpr32_ty _)
-                         (bitcast x @ (value_type (gpr32_ty _))))) x)
+                         (bitcast _ x @ (value_type (gpr32_ty _))))) x)
 (rule 2 (lower (has_type (gpr64_ty _)
-                         (bitcast x @ (value_type (gpr64_ty _))))) x)
+                         (bitcast _ x @ (value_type (gpr64_ty _))))) x)
 
 ;; Bitcast between types residing in FPRs is a no-op.
 (rule 3 (lower (has_type (ty_scalar_float _)
-                         (bitcast x @ (value_type (ty_scalar_float _))))) x)
+                         (bitcast _ x @ (value_type (ty_scalar_float _))))) x)
 
-;; Bitcast between types residing in VRs is a no-op.
-;; FIXME: There are two flavors of vector bitcast, which are currently not
-;; distinguished in CLIF IR. Those generated by Wasmtime assume little-endian
-;; lane order, and those generated elsewhere assume big-endian lane order.
-;; Bitcast is a no-op if current lane order matches that assumed lane order.
-;; However, due to our choice of lane order depending on the current function
-;; ABI, every bitcast we currently see here is indeed a no-op.
-(rule 4 (lower (has_type (vr128_ty _)
-                         (bitcast x @ (value_type (vr128_ty _))))) x)
+;; Bitcast between types residing in VRs is a no-op if lane count is unchanged.
+(rule 5 (lower (has_type (multi_lane bits count)
+                         (bitcast _ x @ (value_type (multi_lane bits count))))) x)
+
+;; Bitcast between types residing in VRs with different lane counts is a
+;; no-op if the operation's MemFlags indicate a byte order compatible with
+;; the current lane order. Otherwise, lane elements need to be swapped,
+;; first in the input type, and then again in the output type. This could
+;; be optimized further, but we don't bother at the moment since due to our
+;; choice of lane order depending on the current function ABI, this case will
+;; currently never arise in practice.
+(rule 4 (lower (has_type (vr128_ty out_ty)
+                         (bitcast flags x @ (value_type (vr128_ty in_ty)))))
+        (abi_vec_elt_rev (lane_order_from_memflags flags) out_ty
+          (abi_vec_elt_rev (lane_order_from_memflags flags) in_ty x)))
 
 ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -3298,25 +3298,25 @@
 ;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
+(rule (lower (has_type $I32 (bitcast _ src @ (value_type $F32))))
       (bitcast_xmm_to_gpr $F32 src))
 
-(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
+(rule (lower (has_type $F32 (bitcast _ src @ (value_type $I32))))
       (bitcast_gpr_to_xmm $I32 src))
 
-(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
+(rule (lower (has_type $I64 (bitcast _ src @ (value_type $F64))))
       (bitcast_xmm_to_gpr $F64 src))
 
-(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
+(rule (lower (has_type $F64 (bitcast _ src @ (value_type $I64))))
       (bitcast_gpr_to_xmm $I64 src))
 
 ;; Bitcast between types residing in GPR registers is a no-op.
 (rule 1 (lower (has_type (is_gpr_type _)
-                         (bitcast x @ (value_type (is_gpr_type _))))) x)
+                         (bitcast _ x @ (value_type (is_gpr_type _))))) x)
 
 ;; Bitcast between types residing in XMM registers is a no-op.
 (rule 2 (lower (has_type (is_xmm_type _)
-                         (bitcast x @ (value_type (is_xmm_type _))))) x)
+                         (bitcast _ x @ (value_type (is_xmm_type _))))) x)
 
 ;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -67,7 +67,7 @@ use crate::ir::entities::AnyEntity;
 use crate::ir::instructions::{BranchInfo, CallInfo, InstructionFormat, ResolvedConstraint};
 use crate::ir::{
     types, ArgumentPurpose, Block, Constant, DynamicStackSlot, FuncRef, Function, GlobalValue,
-    Inst, JumpTable, Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
+    Inst, JumpTable, MemFlags, Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
 };
 use crate::isa::TargetIsa;
 use crate::iterators::IteratorExtras;
@@ -729,11 +729,12 @@ impl<'a> Verifier<'a> {
                     ));
                 }
             }
-            Unary {
+            LoadNoOffset {
                 opcode: Opcode::Bitcast,
+                flags,
                 arg,
             } => {
-                self.verify_bitcast(inst, arg, errors)?;
+                self.verify_bitcast(inst, flags, arg, errors)?;
             }
             UnaryConst {
                 opcode: Opcode::Vconst,
@@ -1070,6 +1071,7 @@ impl<'a> Verifier<'a> {
     fn verify_bitcast(
         &self,
         inst: Inst,
+        flags: MemFlags,
         arg: Value,
         errors: &mut VerifierErrors,
     ) -> VerifierStepResult<()> {
@@ -1086,6 +1088,19 @@ impl<'a> Verifier<'a> {
                     typ.bits()
                 ),
             ))
+        } else if flags != MemFlags::new()
+            && flags != MemFlags::new().with_endianness(ir::Endianness::Little)
+            && flags != MemFlags::new().with_endianness(ir::Endianness::Big)
+        {
+            errors.fatal((
+                inst,
+                "The bitcast instruction only accepts the `big` or `little` memory flags",
+            ))
+        } else if flags == MemFlags::new() && typ.lane_count() != value_type.lane_count() {
+            errors.fatal((
+                inst,
+                "Byte order specifier required for bitcast instruction changing lane count",
+            ))
         } else {
             Ok(())
         }

View File

@@ -0,0 +1,79 @@
test compile precise-output
target s390x

;; Bitcast between integral types is a no-op.

function %bitcast_i8_i8(i8) -> i8 {
block0(v0: i8):
    v1 = bitcast.i8 v0
    return v1
}

; block0:
; br %r14

function %bitcast_i16_i16(i16) -> i16 {
block0(v0: i16):
    v1 = bitcast.i16 v0
    return v1
}

; block0:
; br %r14

function %bitcast_i32_i32(i32) -> i32 {
block0(v0: i32):
    v1 = bitcast.i32 v0
    return v1
}

; block0:
; br %r14

function %bitcast_i64_i64(i64) -> i64 {
block0(v0: i64):
    v1 = bitcast.i64 v0
    return v1
}

; block0:
; br %r14

function %bitcast_i128_i128(i128) -> i128 {
block0(v0: i128):
    v1 = bitcast.i128 v0
    return v1
}

; block0:
; vl %v0, 0(%r3)
; vst %v0, 0(%r2)
; br %r14

function %bitcast_r64_i64(r64) -> i64 {
block0(v0: r64):
    v1 = bitcast.i64 v0
    return v1
}

; block0:
; br %r14

function %bitcast_i64_r64(i64) -> r64 {
block0(v0: i64):
    v1 = bitcast.r64 v0
    return v1
}

; block0:
; br %r14

function %bitcast_r64_r64(r64) -> r64 {
block0(v0: r64):
    v1 = bitcast.r64 v0
    return v1
}

; block0:
; br %r14

View File

@@ -1200,3 +1200,21 @@ block0(v0: f32):
 ; vlgvf %r2, %v0, 0
 ; br %r14
+
+function %bitcast_f32_f32(f32) -> f32 {
+block0(v0: f32):
+    v1 = bitcast.f32 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_f64_f64(f64) -> f64 {
+block0(v0: f64):
+    v1 = bitcast.f64 v0
+    return v1
+}
+
+; block0:
+; br %r14

View File

@@ -0,0 +1,76 @@
test compile precise-output
target s390x

;; Vector bitcast is a no-op if the lane count remains unchanged,
;; or if the ABI lane-order matches the specified byte order.
;; Otherwise, lane-swaps need to happen.

function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {
block0(v0: i64x2):
    v1 = bitcast.i32x4 big v0
    return v1
}

; block0:
; br %r14

function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {
block0(v0: i64x2):
    v1 = bitcast.i32x4 little v0
    return v1
}

; block0:
; vpdi %v3, %v24, %v24, 4
; vpdi %v5, %v3, %v3, 4
; verllg %v24, %v5, 32
; br %r14

function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {
block0(v0: i64x2):
    v1 = bitcast.i32x4 big v0
    return v1
}

; block0:
; vpdi %v3, %v24, %v24, 4
; vpdi %v5, %v3, %v3, 4
; verllg %v24, %v5, 32
; br %r14

function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {
block0(v0: i64x2):
    v1 = bitcast.i32x4 little v0
    return v1
}

; block0:
; br %r14

function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {
block0(v0: i64x2):
    v1 = bitcast.f64x2 big v0
    return v1
}

; block0:
; br %r14

function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {
block0(v0: i64x2):
    v1 = bitcast.f64x2 little v0
    return v1
}

; block0:
; br %r14

function %bitcast_i64x2_f64x2(i64x2) -> f64x2 wasmtime_system_v {
block0(v0: i64x2):
    v1 = bitcast.f64x2 big v0
    return v1
}

; block0:
; br %r14

View File

@@ -7,9 +7,9 @@ block0(v0: i32x4):
     ;; In the x64 backend, all of these pseudo-instructions are lowered to moves between registers (e.g. MOVAPD, MOVDQA,
     ;; etc.). Because these have been marked as moves, no instructions are emitted by this function besides the prologue
     ;; and epilogue.
-    v1 = bitcast.f32x4 v0
-    v2 = bitcast.f64x2 v1
-    v3 = bitcast.i8x16 v2
+    v1 = bitcast.f32x4 little v0
+    v2 = bitcast.f64x2 little v1
+    v3 = bitcast.i8x16 little v2
     return v3
 }

View File

@@ -43,7 +43,7 @@ block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4):
 function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
 block0(v0: i8x16, v1: i8x16, v2: i32x4):
-    v3 = bitcast.i8x16 v2
+    v3 = bitcast.i8x16 little v2
     v4 = bitselect v3, v0, v1
     return v4
 }

View File

@@ -16,7 +16,7 @@ block0(v0: i32x4, v1: i32x4):
 function %mask_casted(i64x2, i64x2, i32x4) -> i64x2 {
 block0(v0: i64x2, v1: i64x2, v2: i32x4):
-    v3 = bitcast.i64x2 v2
+    v3 = bitcast.i64x2 little v2
     v4 = bitselect v3, v0, v1
     return v4
 }

View File

@@ -26,10 +26,10 @@ block0:
 function %shuffle_i32x4_in_same_place() -> i32x4 {
 block0:
     v1 = vconst.i32x4 [0 1 2 3]
-    v2 = bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
+    v2 = bitcast.i8x16 little v1 ; we have to cast because shuffle is type-limited to Tx16
     ; keep each lane in place from the first vector
     v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
-    v4 = bitcast.i32x4 v3
+    v4 = bitcast.i32x4 little v3
     return v4
 }
 ; run: %shuffle_in_same_place() == [0 1 2 3]
@@ -37,10 +37,10 @@ block0:
 function %shuffle_i32x4_to_all_true() -> i32x4 {
 block0:
     v1 = vconst.i32x4 [-1 0 -1 0]
-    v2 = bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
+    v2 = bitcast.i8x16 little v1 ; we have to cast because shuffle is type-limited to Tx16
     ; pair up the true values to make the entire vector true
     v3 = shuffle v2, v2, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
-    v4 = bitcast.i32x4 v3 ; TODO store.i32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
+    v4 = bitcast.i32x4 little v3 ; TODO store.i32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
     return v4
 }
 ; run: %shuffle_i32x4_to_all_true() == [0xffffffff 0xffffffff 0xffffffff 0xffffffff]

View File

@@ -21,3 +21,34 @@ block0(v0: i64):
     return v1
 }
+
+; "little"/"big" flag modifier is ok
+function %bitcast_little(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 little v0
+    return v1
+}
+function %bitcast_big(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 big v0
+    return v1
+}
+
+; other flag modifiers are not ok
+function %bitcast_big(i32) -> f32 {
+block0(v0: i32):
+    v1 = bitcast.f32 notrap v0 ; error: The bitcast instruction only accepts the `big` or `little` memory flags
+    return v1
+}
+function %bitcast_big(i32) -> f32 {
+block0(v0: i32):
+    v1 = bitcast.f32 aligned v0 ; error: The bitcast instruction only accepts the `big` or `little` memory flags
+    return v1
+}
+
+; if lane counts differ, a byte order specifier is required
+function %bitcast_lanes(i32x4) -> i64x2 {
+block0(v0: i32x4):
+    v1 = bitcast.i64x2 v0 ; error: Byte order specifier required for bitcast instruction changing lane count
+    return v1
+}

View File

@@ -868,19 +868,19 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         }
         Operator::F32ReinterpretI32 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(F32, val));
+            state.push1(builder.ins().bitcast(F32, MemFlags::new(), val));
         }
         Operator::F64ReinterpretI64 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(F64, val));
+            state.push1(builder.ins().bitcast(F64, MemFlags::new(), val));
         }
         Operator::I32ReinterpretF32 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(I32, val));
+            state.push1(builder.ins().bitcast(I32, MemFlags::new(), val));
         }
         Operator::I64ReinterpretF64 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(I64, val));
+            state.push1(builder.ins().bitcast(I64, MemFlags::new(), val));
         }
         Operator::I32Extend8S => {
             let val = state.pop1();
@@ -2898,7 +2898,9 @@ fn optionally_bitcast_vector(
     builder: &mut FunctionBuilder,
 ) -> Value {
     if builder.func.dfg.value_type(value) != needed_type {
-        builder.ins().bitcast(needed_type, value)
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        builder.ins().bitcast(needed_type, flags, value)
     } else {
         value
     }
@@ -2933,7 +2935,9 @@ fn canonicalise_v128_values<'a>(
     // Otherwise we'll have to cast, and push the resulting `Value`s into `canonicalised`.
     for v in values {
         tmp_canonicalised.push(if is_non_canonical_v128(builder.func.dfg.value_type(*v)) {
-            builder.ins().bitcast(I8X16, *v)
+            let mut flags = MemFlags::new();
+            flags.set_endianness(ir::Endianness::Little);
+            builder.ins().bitcast(I8X16, flags, *v)
         } else {
             *v
         });
@@ -3056,7 +3060,9 @@ pub fn bitcast_wasm_returns<FE: FuncEnvironment + ?Sized>(
         environ.is_wasm_return(&builder.func.signature, i)
     });
     for (t, arg) in changes {
-        *arg = builder.ins().bitcast(t, *arg);
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        *arg = builder.ins().bitcast(t, flags, *arg);
     }
 }
@@ -3072,6 +3078,8 @@ fn bitcast_wasm_params<FE: FuncEnvironment + ?Sized>(
         environ.is_wasm_parameter(&callee_signature, i)
     });
     for (t, arg) in changes {
-        *arg = builder.ins().bitcast(t, *arg);
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        *arg = builder.ins().bitcast(t, flags, *arg);
     }
 }