diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index cc0582e854..c1b9f4e9fa 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3104,6 +3104,7 @@ pub(crate) fn define(
 
     let x = &Operand::new("x", Mem);
     let a = &Operand::new("a", MemTo).with_doc("Bits of `x` reinterpreted");
+    let MemFlags = &Operand::new("MemFlags", &imm.memflags);
 
     ig.push(
         Inst::new(
@@ -3113,11 +3114,16 @@ pub(crate) fn define(
         The input and output types must be storable to memory and of the same
         size. A bitcast is equivalent to storing one type and loading the other
-        type from the same address.
+        type from the same address, both using the specified MemFlags.
+
+        Note that this operation only supports the `big` or `little` MemFlags.
+        The specified byte order only affects the result in the case where
+        input and output types differ in lane count/size.  In this case, the
+        operation is only valid if a byte order specifier is provided.
         "#,
-            &formats.unary,
+            &formats.load_no_offset,
         )
-        .operands_in(vec![x])
+        .operands_in(vec![MemFlags, x])
         .operands_out(vec![a]),
     );
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index b441b10272..a9cc3846a9 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -2197,25 +2197,25 @@
 ;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ; SIMD&FP <=> SIMD&FP
-(rule 5 (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type (ty_float_or_vec _)))))
+(rule 5 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type (ty_float_or_vec _)))))
       x)
 
 ; GPR => SIMD&FP
-(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast x @ (value_type in_ty))))
+(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast _ x @ (value_type in_ty))))
       (if (ty_int_ref_scalar_64 in_ty))
       (mov_to_fpu x (scalar_size in_ty)))
 
 ; SIMD&FP => GPR
-(rule 3 (lower (has_type out_ty (bitcast x @ (value_type (fits_in_64 (ty_float_or_vec _))))))
+(rule 3 (lower (has_type out_ty (bitcast _ x @ (value_type (fits_in_64 (ty_float_or_vec _))))))
       (if (ty_int_ref_scalar_64 out_ty))
       (mov_from_vec x 0 (scalar_size out_ty)))
 
 ; GPR <=> GPR
-(rule 2 (lower (has_type out_ty (bitcast x @ (value_type in_ty))))
+(rule 2 (lower (has_type out_ty (bitcast _ x @ (value_type in_ty))))
       (if (ty_int_ref_scalar_64 out_ty))
       (if (ty_int_ref_scalar_64 in_ty))
       x)
 
-(rule 1 (lower (has_type $I128 (bitcast x @ (value_type $I128)))) x)
+(rule 1 (lower (has_type $I128 (bitcast _ x @ (value_type $I128)))) x)
 
 ;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 750e2a5cf1..c30087525f 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -811,7 +811,7 @@
 )
 ;;;;; Rules for `bitcast`;;;;;;;;;
 (rule
-  (lower (has_type out (bitcast v @ (value_type in_ty))))
+  (lower (has_type out (bitcast _ v @ (value_type in_ty))))
   (gen_moves v in_ty out))
 
 ;;;;; Rules for `ceil`;;;;;;;;;
diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle
index 5d519abbaa..5261138ba2 100644
--- a/cranelift/codegen/src/isa/s390x/inst.isle
+++ b/cranelift/codegen/src/isa/s390x/inst.isle
@@ -1486,6 +1486,11 @@
 (rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.LittleEndian)) $false)
 (rule (lane_order_equal (LaneOrder.BigEndian) (LaneOrder.BigEndian)) $true)
 
+;; Return lane order matching memory byte order.
+(decl pure lane_order_from_memflags (MemFlags) LaneOrder)
+(rule 0 (lane_order_from_memflags (littleendian)) (LaneOrder.LittleEndian))
+(rule 1 (lane_order_from_memflags (bigendian)) (LaneOrder.BigEndian))
+
 ;; Convert a CLIF immediate lane index value to big-endian lane order.
 (decl be_lane_idx (Type u8) u8)
 (extern constructor be_lane_idx be_lane_idx)
diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle
index 6c2629aab3..0e4ec27dec 100644
--- a/cranelift/codegen/src/isa/s390x/lower.isle
+++ b/cranelift/codegen/src/isa/s390x/lower.isle
@@ -1738,40 +1738,46 @@
 ;;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Reinterpret a 64-bit integer value as floating-point.
-(rule (lower (has_type $F64 (bitcast x @ (value_type $I64))))
+(rule (lower (has_type $F64 (bitcast _ x @ (value_type $I64))))
       (vec_insert_lane_undef $F64X2 x 0 (zero_reg)))
 
 ;; Reinterpret a 64-bit floating-point value as integer.
-(rule (lower (has_type $I64 (bitcast x @ (value_type $F64))))
+(rule (lower (has_type $I64 (bitcast _ x @ (value_type $F64))))
       (vec_extract_lane $F64X2 x 0 (zero_reg)))
 
 ;; Reinterpret a 32-bit integer value as floating-point.
-(rule (lower (has_type $F32 (bitcast x @ (value_type $I32))))
+(rule (lower (has_type $F32 (bitcast _ x @ (value_type $I32))))
       (vec_insert_lane_undef $F32X4 x 0 (zero_reg)))
 
 ;; Reinterpret a 32-bit floating-point value as integer.
-(rule (lower (has_type $I32 (bitcast x @ (value_type $F32))))
+(rule (lower (has_type $I32 (bitcast _ x @ (value_type $F32))))
       (vec_extract_lane $F32X4 x 0 (zero_reg)))
 
 ;; Bitcast between types residing in GPRs is a no-op.
 (rule 1 (lower (has_type (gpr32_ty _)
-                         (bitcast x @ (value_type (gpr32_ty _))))) x)
+                         (bitcast _ x @ (value_type (gpr32_ty _))))) x)
 (rule 2 (lower (has_type (gpr64_ty _)
-                         (bitcast x @ (value_type (gpr64_ty _))))) x)
+                         (bitcast _ x @ (value_type (gpr64_ty _))))) x)
 
 ;; Bitcast between types residing in FPRs is a no-op.
 (rule 3 (lower (has_type (ty_scalar_float _)
-                         (bitcast x @ (value_type (ty_scalar_float _))))) x)
+                         (bitcast _ x @ (value_type (ty_scalar_float _))))) x)
 
-;; Bitcast between types residing in VRs is a no-op.
-;; FIXME: There are two flavors of vector bitcast, which are currently not
-;; distinguished in CLIF IR.  Those generated by Wasmtime assume little-endian
-;; lane order, and those generated elsewhere assume big-endian lane order.
-;; Bitcast is a no-op if current lane order matches that assumed lane order.
-;; However, due to our choice of lane order depending on the current function
-;; ABI, every bitcast we currently see here is indeed a no-op.
-(rule 4 (lower (has_type (vr128_ty _)
-                         (bitcast x @ (value_type (vr128_ty _))))) x)
+;; Bitcast between types residing in VRs is a no-op if lane count is unchanged.
+(rule 5 (lower (has_type (multi_lane bits count)
+                         (bitcast _ x @ (value_type (multi_lane bits count))))) x)
+
+;; Bitcast between types residing in VRs with different lane counts is a
+;; no-op if the operation's MemFlags indicate a byte order compatible with
+;; the current lane order.  Otherwise, lane elements need to be swapped,
+;; first in the input type, and then again in the output type.  This could
+;; be optimized further, but we don't bother at the moment since due to our
+;; choice of lane order depending on the current function ABI, this case will
+;; currently never arise in practice.
+(rule 4 (lower (has_type (vr128_ty out_ty)
+                         (bitcast flags x @ (value_type (vr128_ty in_ty)))))
+      (abi_vec_elt_rev (lane_order_from_memflags flags) out_ty
+        (abi_vec_elt_rev (lane_order_from_memflags flags) in_ty x)))
 
 ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index f08d6ad5a7..326a83258b 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3298,25 +3298,25 @@
 
 ;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule (lower (has_type $I32 (bitcast src @ (value_type $F32))))
+(rule (lower (has_type $I32 (bitcast _ src @ (value_type $F32))))
       (bitcast_xmm_to_gpr $F32 src))
 
-(rule (lower (has_type $F32 (bitcast src @ (value_type $I32))))
+(rule (lower (has_type $F32 (bitcast _ src @ (value_type $I32))))
       (bitcast_gpr_to_xmm $I32 src))
 
-(rule (lower (has_type $I64 (bitcast src @ (value_type $F64))))
+(rule (lower (has_type $I64 (bitcast _ src @ (value_type $F64))))
       (bitcast_xmm_to_gpr $F64 src))
 
-(rule (lower (has_type $F64 (bitcast src @ (value_type $I64))))
+(rule (lower (has_type $F64 (bitcast _ src @ (value_type $I64))))
      (bitcast_gpr_to_xmm $I64 src))
 
 ;; Bitcast between types residing in GPR registers is a no-op.
 (rule 1 (lower (has_type (is_gpr_type _)
-                         (bitcast x @ (value_type (is_gpr_type _))))) x)
+                         (bitcast _ x @ (value_type (is_gpr_type _))))) x)
 
 ;; Bitcast between types residing in XMM registers is a no-op.
 (rule 2 (lower (has_type (is_xmm_type _)
-                         (bitcast x @ (value_type (is_xmm_type _))))) x)
+                         (bitcast _ x @ (value_type (is_xmm_type _))))) x)
 
 ;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cranelift/codegen/src/verifier/mod.rs b/cranelift/codegen/src/verifier/mod.rs
index 80c79eb456..d47817e09e 100644
--- a/cranelift/codegen/src/verifier/mod.rs
+++ b/cranelift/codegen/src/verifier/mod.rs
@@ -67,7 +67,7 @@ use crate::ir::entities::AnyEntity;
 use crate::ir::instructions::{BranchInfo, CallInfo, InstructionFormat, ResolvedConstraint};
 use crate::ir::{
     types, ArgumentPurpose, Block, Constant, DynamicStackSlot, FuncRef, Function, GlobalValue,
-    Inst, JumpTable, Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
+    Inst, JumpTable, MemFlags, Opcode, SigRef, StackSlot, Type, Value, ValueDef, ValueList,
 };
 use crate::isa::TargetIsa;
 use crate::iterators::IteratorExtras;
@@ -729,11 +729,12 @@ impl<'a> Verifier<'a> {
                     ));
                 }
             }
-            Unary {
+            LoadNoOffset {
                 opcode: Opcode::Bitcast,
+                flags,
                 arg,
             } => {
-                self.verify_bitcast(inst, arg, errors)?;
+                self.verify_bitcast(inst, flags, arg, errors)?;
             }
             UnaryConst {
                 opcode: Opcode::Vconst,
@@ -1070,6 +1071,7 @@
     fn verify_bitcast(
         &self,
         inst: Inst,
+        flags: MemFlags,
         arg: Value,
         errors: &mut VerifierErrors,
     ) -> VerifierStepResult<()> {
@@ -1086,6 +1088,19 @@
                     typ.bits()
                 ),
             ))
+        } else if flags != MemFlags::new()
+            && flags != MemFlags::new().with_endianness(ir::Endianness::Little)
+            && flags != MemFlags::new().with_endianness(ir::Endianness::Big)
+        {
+            errors.fatal((
+                inst,
+                "The bitcast instruction only accepts the `big` or `little` memory flags",
+            ))
+        } else if flags == MemFlags::new() && typ.lane_count() != value_type.lane_count() {
+            errors.fatal((
+                inst,
+                "Byte order specifier required for bitcast instruction changing lane count",
+            ))
         } else {
             Ok(())
         }
diff --git a/cranelift/filetests/filetests/isa/s390x/bitcast.clif b/cranelift/filetests/filetests/isa/s390x/bitcast.clif
new file mode 100644
index 0000000000..6d946badba
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/s390x/bitcast.clif
@@ -0,0 +1,79 @@
+test compile precise-output
+target s390x
+
+;; Bitcast between integral types is a no-op.
+
+function %bitcast_i8_i8(i8) -> i8 {
+block0(v0: i8):
+    v1 = bitcast.i8 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i16_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = bitcast.i16 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i32_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = bitcast.i32 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i64_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = bitcast.i64 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i128_i128(i128) -> i128 {
+block0(v0: i128):
+    v1 = bitcast.i128 v0
+    return v1
+}
+
+; block0:
+; vl %v0, 0(%r3)
+; vst %v0, 0(%r2)
+; br %r14
+
+function %bitcast_r64_i64(r64) -> i64 {
+block0(v0: r64):
+    v1 = bitcast.i64 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i64_r64(i64) -> r64 {
+block0(v0: i64):
+    v1 = bitcast.r64 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_r64_r64(r64) -> r64 {
+block0(v0: r64):
+    v1 = bitcast.r64 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
diff --git a/cranelift/filetests/filetests/isa/s390x/floating-point.clif b/cranelift/filetests/filetests/isa/s390x/floating-point.clif
index 1bfbf090bc..fe4a6cf41f 100644
--- a/cranelift/filetests/filetests/isa/s390x/floating-point.clif
+++ b/cranelift/filetests/filetests/isa/s390x/floating-point.clif
@@ -1200,3 +1200,21 @@ block0(v0: f32):
 ; vlgvf %r2, %v0, 0
 ; br %r14
 
+function %bitcast_f32_f32(f32) -> f32 {
+block0(v0: f32):
+    v1 = bitcast.f32 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_f64_f64(f64) -> f64 {
+block0(v0: f64):
+    v1 = bitcast.f64 v0
+    return v1
+}
+
+; block0:
+; br %r14
+
diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif
new file mode 100644
index 0000000000..c0d8ae4563
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/s390x/vec-bitcast.clif
@@ -0,0 +1,76 @@
+test compile precise-output
+target s390x
+
+;; Vector bitcast is a no-op if the lane count remains unchanged,
+;; or if the ABI lane-order matches the specified byte order.
+;; Otherwise, lane-swaps need to happen.
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {
+block0(v0: i64x2):
+    v1 = bitcast.i32x4 big v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 {
+block0(v0: i64x2):
+    v1 = bitcast.i32x4 little v0
+    return v1
+}
+
+; block0:
+; vpdi %v3, %v24, %v24, 4
+; vpdi %v5, %v3, %v3, 4
+; verllg %v24, %v5, 32
+; br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {
+block0(v0: i64x2):
+    v1 = bitcast.i32x4 big v0
+    return v1
+}
+
+; block0:
+; vpdi %v3, %v24, %v24, 4
+; vpdi %v5, %v3, %v3, 4
+; verllg %v24, %v5, 32
+; br %r14
+
+function %bitcast_i64x2_i32x4(i64x2) -> i32x4 wasmtime_system_v {
+block0(v0: i64x2):
+    v1 = bitcast.i32x4 little v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {
+block0(v0: i64x2):
+    v1 = bitcast.f64x2 big v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 {
+block0(v0: i64x2):
+    v1 = bitcast.f64x2 little v0
+    return v1
+}
+
+; block0:
+; br %r14
+
+function %bitcast_i64x2_f64x2(i64x2) -> f64x2 wasmtime_system_v {
+block0(v0: i64x2):
+    v1 = bitcast.f64x2 big v0
+    return v1
+}
+
+; block0:
+; br %r14
+
diff --git a/cranelift/filetests/filetests/isa/x64/move-elision.clif b/cranelift/filetests/filetests/isa/x64/move-elision.clif
index 08280bae2d..f879653384 100644
--- a/cranelift/filetests/filetests/isa/x64/move-elision.clif
+++ b/cranelift/filetests/filetests/isa/x64/move-elision.clif
@@ -7,9 +7,9 @@ block0(v0: i32x4):
     ;; In the x64 backend, all of these pseudo-instructions are lowered to moves between registers (e.g. MOVAPD, MOVDQA,
     ;; etc.). Because these have been marked as moves, no instructions are emitted by this function besides the prologue
     ;; and epilogue.
-    v1 = bitcast.f32x4 v0
-    v2 = bitcast.f64x2 v1
-    v3 = bitcast.i8x16 v2
+    v1 = bitcast.f32x4 little v0
+    v2 = bitcast.f64x2 little v1
+    v3 = bitcast.i8x16 little v2
     return v3
 }
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
index edcc1f2771..11c4907c23 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
@@ -43,7 +43,7 @@ block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4):
 
 function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
 block0(v0: i8x16, v1: i8x16, v2: i32x4):
-    v3 = bitcast.i8x16 v2
+    v3 = bitcast.i8x16 little v2
     v4 = bitselect v3, v0, v1
     return v4
 }
diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif
index 20d7c05e38..0c55444559 100644
--- a/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bitselect-to-vselect.clif
@@ -16,7 +16,7 @@ block0(v0: i32x4, v1: i32x4):
 
 function %mask_casted(i64x2, i64x2, i32x4) -> i64x2 {
 block0(v0: i64x2, v1: i64x2, v2: i32x4):
-    v3 = bitcast.i64x2 v2
+    v3 = bitcast.i64x2 little v2
     v4 = bitselect v3, v0, v1
     return v4
 }
diff --git a/cranelift/filetests/filetests/runtests/simd-lane-access.clif b/cranelift/filetests/filetests/runtests/simd-lane-access.clif
index a73e6dca5d..412de68046 100644
--- a/cranelift/filetests/filetests/runtests/simd-lane-access.clif
+++ b/cranelift/filetests/filetests/runtests/simd-lane-access.clif
@@ -26,10 +26,10 @@ block0:
 function %shuffle_i32x4_in_same_place() -> i32x4 {
 block0:
     v1 = vconst.i32x4 [0 1 2 3]
-    v2 = bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
+    v2 = bitcast.i8x16 little v1 ; we have to cast because shuffle is type-limited to Tx16
     ; keep each lane in place from the first vector
     v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
-    v4 = bitcast.i32x4 v3
+    v4 = bitcast.i32x4 little v3
     return v4
 }
 ; run: %shuffle_in_same_place() == [0 1 2 3]
@@ -37,10 +37,10 @@ block0:
 function %shuffle_i32x4_to_all_true() -> i32x4 {
 block0:
     v1 = vconst.i32x4 [-1 0 -1 0]
-    v2 = bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
+    v2 = bitcast.i8x16 little v1 ; we have to cast because shuffle is type-limited to Tx16
     ; pair up the true values to make the entire vector true
     v3 = shuffle v2, v2, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
-    v4 = bitcast.i32x4 v3 ; TODO store.i32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
+    v4 = bitcast.i32x4 little v3 ; TODO store.i32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
     return v4
 }
 ; run: %shuffle_i32x4_to_all_true() == [0xffffffff 0xffffffff 0xffffffff 0xffffffff]
diff --git a/cranelift/filetests/filetests/verifier/bitcast.clif b/cranelift/filetests/filetests/verifier/bitcast.clif
index 5ed7b8386d..6c936c8ff0 100644
--- a/cranelift/filetests/filetests/verifier/bitcast.clif
+++ b/cranelift/filetests/filetests/verifier/bitcast.clif
@@ -21,3 +21,34 @@ block0(v0: i64):
     return v1
 }
 
+; "little"/"big" flag modifier is ok
+function %bitcast_little(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 little v0
+    return v1
+}
+function %bitcast_big(i32) -> f32 { ; Ok
+block0(v0: i32):
+    v1 = bitcast.f32 big v0
+    return v1
+}
+
+; other flag modifiers are not ok
+function %bitcast_big(i32) -> f32 {
+block0(v0: i32):
+    v1 = bitcast.f32 notrap v0 ; error: The bitcast instruction only accepts the `big` or `little` memory flags
+    return v1
+}
+function %bitcast_big(i32) -> f32 {
+block0(v0: i32):
+    v1 = bitcast.f32 aligned v0 ; error: The bitcast instruction only accepts the `big` or `little` memory flags
+    return v1
+}
+
+; if lane counts differ, a byte order specifier is required
+function %bitcast_lanes(i32x4) -> i64x2 {
+block0(v0: i32x4):
+    v1 = bitcast.i64x2 v0 ; error: Byte order specifier required for bitcast instruction changing lane count
+    return v1
+}
+
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 7122ba5f91..86d54ffc03 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -868,19 +868,19 @@ pub fn translate_operator(
         }
         Operator::F32ReinterpretI32 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(F32, val));
+            state.push1(builder.ins().bitcast(F32, MemFlags::new(), val));
         }
         Operator::F64ReinterpretI64 => {
            let val = state.pop1();
-            state.push1(builder.ins().bitcast(F64, val));
+            state.push1(builder.ins().bitcast(F64, MemFlags::new(), val));
         }
         Operator::I32ReinterpretF32 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(I32, val));
+            state.push1(builder.ins().bitcast(I32, MemFlags::new(), val));
         }
         Operator::I64ReinterpretF64 => {
             let val = state.pop1();
-            state.push1(builder.ins().bitcast(I64, val));
+            state.push1(builder.ins().bitcast(I64, MemFlags::new(), val));
         }
         Operator::I32Extend8S => {
             let val = state.pop1();
@@ -2898,7 +2898,9 @@
     builder: &mut FunctionBuilder,
 ) -> Value {
     if builder.func.dfg.value_type(value) != needed_type {
-        builder.ins().bitcast(needed_type, value)
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        builder.ins().bitcast(needed_type, flags, value)
     } else {
         value
     }
@@ -2933,7 +2935,9 @@ fn canonicalise_v128_values<'a>(
     // Otherwise we'll have to cast, and push the resulting `Value`s into `canonicalised`.
     for v in values {
         tmp_canonicalised.push(if is_non_canonical_v128(builder.func.dfg.value_type(*v)) {
-            builder.ins().bitcast(I8X16, *v)
+            let mut flags = MemFlags::new();
+            flags.set_endianness(ir::Endianness::Little);
+            builder.ins().bitcast(I8X16, flags, *v)
         } else {
             *v
         });
@@ -3056,7 +3060,9 @@ pub fn bitcast_wasm_returns(
         environ.is_wasm_return(&builder.func.signature, i)
     });
     for (t, arg) in changes {
-        *arg = builder.ins().bitcast(t, *arg);
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        *arg = builder.ins().bitcast(t, flags, *arg);
     }
 }
@@ -3072,6 +3078,8 @@ fn bitcast_wasm_params(
         environ.is_wasm_parameter(&callee_signature, i)
    });
     for (t, arg) in changes {
-        *arg = builder.ins().bitcast(t, *arg);
+        let mut flags = MemFlags::new();
+        flags.set_endianness(ir::Endianness::Little);
+        *arg = builder.ins().bitcast(t, flags, *arg);
     }
 }
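
A quick usage sketch, not part of the patch itself: with this change, `bitcast`
carries an optional `big`/`little` byte-order flag between the type and the
operand. The two CLIF functions below are illustrative only (hypothetical
names), following the syntax exercised by the filetests above.

function %reinterpret_i32_f32(i32) -> f32 {
block0(v0: i32):
    v1 = bitcast.f32 little v0   ; lane count unchanged: the flag is optional
    return v1
}

function %cast_i32x4_i64x2(i32x4) -> i64x2 {
block0(v0: i32x4):
    v1 = bitcast.i64x2 little v0 ; lane count changes: a byte order is required
    return v1
}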