cranelift: Add Bswap instruction (#1092) (#5147)

Adds Bswap to the Cranelift IR. Implements the Bswap instruction in the x64 and aarch64 codegen backends. Cranelift users can now call: ``` builder.ins().bswap(value) ``` to get a native byteswap instruction. * x64: implements the 32- and 64-bit bswap instruction, following the pattern set by similar unary instructions (Neg and Not) - it only operates on a dst register, but is parameterized with both a src and a dst, which are expected to be the same register. As the x64 bswap instruction exists only for 32- and 64-bit registers, the 16-bit swap is implemented as a rotate left by 8. Updated the x64 RexFlags type to support emitting the REX prefix for single-operand instructions like bswap. * aarch64: Bswap gets emitted as the aarch64 rev16, rev32, or rev64 instruction as appropriate. * s390x: Bswap was already supported in the backend; it just needed a bit of plumbing. * For completeness, added bswap to the interpreter as well. * Added filetests and runtests for each ISA. * Added bswap to fuzzgen; thanks to afonso360 for the code there. * 128-bit swaps are not yet implemented in the backends; that can be done later.
2022-10-31 12:30:00 -07:00
parent 95ecb7e4d4
commit 4ca9e82bd1
24 changed files with 455 additions and 0 deletions
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -643,6 +643,12 @@ pub(crate) fn define(
        TypeSetBuilder::new().ints(Interval::All).build(),
    );

+    // Integer types accepted by `bswap`. The interval starts at 16 bits, so `i8`
+    // is excluded: a single byte has no byte order to reverse.
+    let iSwappable = &TypeVar::new(
+        "iSwappable",
+        "A multi byte scalar integer type",
+        TypeSetBuilder::new().ints(16..128).build(),
+    );
+
    let iAddr = &TypeVar::new(
        "iAddr",
        "An integer address type",
@@ -2699,6 +2705,23 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    // `bswap`: a unary-format instruction whose input `x` and result `a` are both
+    // typed by the `iSwappable` type variable.
+    let x = &Operand::new("x", iSwappable);
+    let a = &Operand::new("a", iSwappable);
+
+    ig.push(
+        Inst::new(
+            "bswap",
+            r#"
+        Reverse the byte order of an integer.
+
+        Reverses the bytes in ``x``.
+        "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
    let x = &Operand::new("x", Int);
    let a = &Operand::new("a", Int);

--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -1023,6 +1023,10 @@
    (RBit)
    (Clz)
    (Cls)
+    ;; Byte reverse
+    (Rev16)
+    (Rev32)
+    (Rev64)
 ))

 (type MemLabel extern (enum))
@@ -2579,6 +2583,17 @@
 (decl a64_cls (Type Reg) Reg)
 (rule (a64_cls ty x) (bit_rr (BitOp.Cls) ty x))

+;; Helpers for generating the `rev16`, `rev32` and `rev64` byte-reverse
+;; instructions. Each helper emits a `bit_rr` with the matching `BitOp` variant,
+;; passing the given type and source register through unchanged.
+
+(decl a64_rev16 (Type Reg) Reg)
+(rule (a64_rev16 ty x) (bit_rr (BitOp.Rev16) ty x))
+
+(decl a64_rev32 (Type Reg) Reg)
+(rule (a64_rev32 ty x) (bit_rr (BitOp.Rev32) ty x))
+
+(decl a64_rev64 (Type Reg) Reg)
+(rule (a64_rev64 ty x) (bit_rr (BitOp.Rev64) ty x))
+
 ;; Helpers for generating `eon` instructions.

 (decl eon (Type Reg Reg) Reg)
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -934,6 +934,9 @@ impl MachInstEmit for Inst {
                    BitOp::RBit => (0b00000, 0b000000),
                    BitOp::Clz => (0b00000, 0b000100),
                    BitOp::Cls => (0b00000, 0b000101),
+                    BitOp::Rev16 => (0b00000, 0b000001),
+                    BitOp::Rev32 => (0b00000, 0b000010),
+                    BitOp::Rev64 => (0b00000, 0b000011),
                };
                sink.put4(enc_bit_rr(size.sf_bit(), op1, op2, rn, rd))
            }
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1375,6 +1375,61 @@ fn test_aarch64_binemit() {
        "cls x21, x16",
    ));

+    insns.push((
+        Inst::BitRR {
+            op: BitOp::Rev16,
+            size: OperandSize::Size64,
+            rd: writable_xreg(2),
+            rn: xreg(11),
+        },
+        "6205C0DA",
+        "rev16 x2, x11",
+    ));
+
+    insns.push((
+        Inst::BitRR {
+            op: BitOp::Rev16,
+            size: OperandSize::Size32,
+            rd: writable_xreg(3),
+            rn: xreg(21),
+        },
+        "A306C05A",
+        "rev16 w3, w21",
+    ));
+
+    insns.push((
+        Inst::BitRR {
+            op: BitOp::Rev32,
+            size: OperandSize::Size64,
+            rd: writable_xreg(2),
+            rn: xreg(11),
+        },
+        "6209C0DA",
+        "rev32 x2, x11",
+    ));
+
+    insns.push((
+        Inst::BitRR {
+            op: BitOp::Rev32,
+            size: OperandSize::Size32,
+            rd: writable_xreg(3),
+            rn: xreg(21),
+        },
+        "A30AC05A",
+        "rev32 w3, w21",
+    ));
+
+    insns.push((
+        Inst::BitRR {
+            op: BitOp::Rev64,
+            size: OperandSize::Size64,
+            rd: writable_xreg(1),
+            rn: xreg(10),
+        },
+        "410DC0DA",
+        "rev64 x1, x10",
+    ));
+
    insns.push((
        Inst::ULoad8 {
            rd: writable_xreg(1),
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -67,6 +67,9 @@ impl BitOp {
            BitOp::RBit => "rbit",
            BitOp::Clz => "clz",
            BitOp::Cls => "cls",
+            BitOp::Rev16 => "rev16",
+            BitOp::Rev32 => "rev32",
+            BitOp::Rev64 => "rev64",
        }
    }
 }
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -1517,6 +1517,17 @@
 (rule -1 (lower (has_type ty (cls x)))
      (a64_cls ty x))

+;;;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Each supported width maps to exactly one helper: `a64_rev16` for `$I16`,
+;; `a64_rev32` for `$I32` and `a64_rev64` for `$I64`. There is no rule for
+;; `$I128` here.
+(rule (lower (has_type $I16 (bswap x)))
+      (a64_rev16 $I16 x))
+
+(rule (lower (has_type $I32 (bswap x)))
+      (a64_rev32 $I32 x))
+
+(rule (lower (has_type $I64 (bswap x)))
+      (a64_rev64 $I64 x))
+
 ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Bmask tests the value against zero, and uses `csetm` to assert the result.
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -96,6 +96,8 @@ pub(crate) fn lower_insn_to_regs(

        Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => implemented_in_isle(ctx),

+        Opcode::Bswap => implemented_in_isle(ctx),
+
        Opcode::Popcnt => implemented_in_isle(ctx),

        Opcode::Load
--- a/cranelift/codegen/src/isa/s390x/lower.isle
+++ b/cranelift/codegen/src/isa/s390x/lower.isle
@@ -1188,6 +1188,18 @@
                                            7 6 5 4 3 2 1 0))))


+;;;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; 16-bit case: byte-swap the value as a 32-bit quantity, then logically shift
+;; the result right by 16 so the two swapped bytes end up in the low halfword.
+(rule (lower (has_type $I16 (bswap x)))
+      (lshr_imm $I32 (bswap_reg $I32 x) 16))
+
+;; 32- and 64-bit cases map directly onto `bswap_reg` at the same width.
+(rule (lower (has_type $I32 (bswap x)))
+      (bswap_reg $I32 x))
+
+(rule (lower (has_type $I64 (bswap x)))
+      (bswap_reg $I64 x))
+
+
 ;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; The FLOGR hardware instruction always operates on the full 64-bit register.
--- a/cranelift/codegen/src/isa/s390x/lower.rs
+++ b/cranelift/codegen/src/isa/s390x/lower.rs
@@ -100,6 +100,7 @@ impl LowerBackend for S390xBackend {
            | Opcode::Vselect
            | Opcode::Bmask
            | Opcode::Bitrev
+            | Opcode::Bswap
            | Opcode::Clz
            | Opcode::Cls
            | Opcode::Ctz
--- a/cranelift/codegen/src/isa/x64/encoding/rex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/rex.rs
@@ -105,6 +105,21 @@ impl RexFlags {
        (self.0 & 2) != 0
    }

+    /// Emits the REX prefix for an instruction whose only register operand is
+    /// encoded in the opcode byte itself.
+    ///
+    /// `enc_e` is the hardware encoding of that register; only its bit 3 is
+    /// used here, as REX.B. REX.R and REX.X are never set in this form. Nothing
+    /// is written when the prefix would be the bare `0x40` and emission is not
+    /// forced by `must_always_emit`.
+    #[inline(always)]
+    pub(crate) fn emit_one_op(&self, sink: &mut MachBuffer<Inst>, enc_e: u8) {
+        let rex_w: u8 = if self.must_clear_w() { 0 } else { 1 };
+        // REX.B == 1 selects r8-r15.
+        let rex_b = (enc_e >> 3) & 1;
+        let prefix = 0x40 | (rex_w << 3) | rex_b;
+
+        let redundant = prefix == 0x40 && !self.must_always_emit();
+        if !redundant {
+            sink.put1(prefix);
+        }
+    }
+
    #[inline(always)]
    pub(crate) fn emit_two_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_e: u8) {
        let w = if self.must_clear_w() { 0 } else { 1 };
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -151,6 +151,11 @@
       (Setcc (cc CC)
              (dst WritableGpr))

+       ;; Swaps byte order in register
+       (Bswap (size OperandSize) ;; 4 or 8
+              (src Gpr)
+              (dst WritableGpr))
+
       ;; =========================================
       ;; Conditional moves.

@@ -1959,6 +1964,16 @@
 (rule (x64_sar ty src1 src2)
      (shift_r ty (ShiftKind.ShiftRightArithmetic) src1 src2))

+;; Helper for creating `MInst.Bswap` byte-swap instructions.
+;; On x64, the BSWAP instruction is used for 32- and 64-bit registers; for
+;; 16-bit registers one must instead use xchg or rol/ror.
+;; The operand size is derived from `ty` via `operand_size_of_type_32_64`, and
+;; the swapped value is returned in a fresh temporary register.
+(decl x64_bswap (Type Gpr) Gpr)
+(rule (x64_bswap ty src)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (size OperandSize (operand_size_of_type_32_64 ty))
+            (_ Unit (emit (MInst.Bswap size src dst))))
+        dst))
+
 ;; Helper for creating `MInst.CmpRmiR` instructions.
 (decl cmp_rmi_r (OperandSize CmpOpcode GprMemImm Gpr) ProducesFlags)
 (rule (cmp_rmi_r size opcode src1 src2)
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1115,6 +1115,21 @@ pub(crate) fn emit(
            );
        }

+        // Byte-swap a register in place.
+        Inst::Bswap { size, src, dst } => {
+            let src = allocs.next(src.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+            // Only one register is encoded below, so both operands must have
+            // been allocated to the same register.
+            debug_assert_eq!(src, dst);
+            let enc_reg = int_reg_enc(dst);
+
+            // BSWAP reg32 is (REX.W==0) 0F C8+rd
+            // BSWAP reg64 is (REX.W==1) 0F C8+rd
+            // Bit 3 of the register encoding goes into the REX prefix; the low
+            // three bits are OR-ed into the second opcode byte.
+            let rex_flags = RexFlags::from(*size);
+            rex_flags.emit_one_op(sink, enc_reg);
+
+            sink.put1(0x0F);
+            sink.put1(0xC8 | (enc_reg & 7));
+        }
+
        Inst::Cmove {
            size,
            cc,
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -107,6 +107,13 @@ impl Inst {
        Inst::Setcc { cc, dst }
    }

+    /// Test helper: builds an `Inst::Bswap` of the given `size` in which `dst`
+    /// is used as both the source and the destination register.
+    ///
+    /// `dst` must be an integer-class register; this is checked with a
+    /// `debug_assert!` and again by the `unwrap`s on the `Gpr` conversions.
+    fn bswap(size: OperandSize, dst: Writable<Reg>) -> Inst {
+        debug_assert!(dst.to_reg().class() == RegClass::Int);
+        let src = Gpr::new(dst.to_reg()).unwrap();
+        let dst = WritableGpr::from_writable_reg(dst).unwrap();
+        Inst::Bswap { size, src, dst }
+    }
+
    fn xmm_rm_r_imm(
        op: SseOpcode,
        src: RegMem,
@@ -3505,6 +3512,55 @@ fn test_x64_emit() {
    insns.push((Inst::setcc(CC::LE, w_r14), "410F9EC6", "setle   %r14b"));
    insns.push((Inst::setcc(CC::P, w_r9), "410F9AC1", "setp    %r9b"));
    insns.push((Inst::setcc(CC::NP, w_r8), "410F9BC0", "setnp   %r8b"));
+
+    // ========================================================
+    // Bswap
+    insns.push((
+        Inst::bswap(OperandSize::Size64, w_rax),
+        "480FC8",
+        "bswapq  %rax, %rax",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size64, w_r8),
+        "490FC8",
+        "bswapq  %r8, %r8",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size32, w_rax),
+        "0FC8",
+        "bswapl  %eax, %eax",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size64, w_rcx),
+        "480FC9",
+        "bswapq  %rcx, %rcx",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size32, w_rcx),
+        "0FC9",
+        "bswapl  %ecx, %ecx",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size64, w_r11),
+        "490FCB",
+        "bswapq  %r11, %r11",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size32, w_r11),
+        "410FCB",
+        "bswapl  %r11d, %r11d",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size64, w_r14),
+        "490FCE",
+        "bswapq  %r14, %r14",
+    ));
+    insns.push((
+        Inst::bswap(OperandSize::Size32, w_r14),
+        "410FCE",
+        "bswapl  %r14d, %r14d",
+    ));
+
    // ========================================================
    // Cmove
    insns.push((
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -68,6 +68,7 @@ impl Inst {
            Inst::AluRmiR { .. }
            | Inst::AluRM { .. }
            | Inst::AtomicRmwSeq { .. }
+            | Inst::Bswap { .. }
            | Inst::CallKnown { .. }
            | Inst::CallUnknown { .. }
            | Inst::CheckedDivOrRemSeq { .. }
@@ -1373,6 +1374,17 @@ impl PrettyPrint for Inst {
                format!("{} {}", ljustify2("set".to_string(), cc.to_string()), dst)
            }

+            // Printed in two-operand form, `bswap<suffix> src, dst`, with both
+            // register names rendered at the width given by `size`.
+            Inst::Bswap { size, src, dst } => {
+                let src = pretty_print_reg(src.to_reg(), size.to_bytes(), allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
+                format!(
+                    "{} {}, {}",
+                    ljustify2("bswap".to_string(), suffix_bwlq(*size)),
+                    src,
+                    dst
+                )
+            }
+
            Inst::Cmove {
                size,
                cc,
@@ -1953,6 +1965,10 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
        Inst::Setcc { dst, .. } => {
            collector.reg_def(dst.to_writable_reg());
        }
+        Inst::Bswap { src, dst, .. } => {
+            // `dst` is registered as a def that reuses operand 0 -- presumably
+            // the `src` use added on the line before, so both get the same
+            // register. NOTE(review): confirm the operand indexing.
+            collector.reg_use(src.to_reg());
+            collector.reg_reuse_def(dst.to_writable_reg(), 0);
+        }
        Inst::Cmove {
            consequent,
            alternative,
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2065,6 +2065,19 @@
                            hi32)))
        swap32))

+;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The x64 bswap instruction only exists for 32- and 64-bit swaps, so the
+;; 16-bit swap is implemented as a rotate left by 8: rotating a 16-bit value by
+;; one byte exchanges its two bytes.
+(rule (lower (has_type $I16 (bswap src)))
+      (x64_rotl $I16 src (Imm8Reg.Imm8 8)))
+
+;; 32- and 64-bit swaps use the native instruction via `x64_bswap`.
+(rule (lower (has_type $I32 (bswap src)))
+      (x64_bswap $I32 src))
+
+(rule (lower (has_type $I64 (bswap src)))
+      (x64_bswap $I64 src))
+
 ;; Rules for `is_null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Null references are represented by the constant value `0`.
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -363,6 +363,7 @@ fn lower_insn_to_regs(
        | Opcode::Ctz
        | Opcode::Popcnt
        | Opcode::Bitrev
+        | Opcode::Bswap
        | Opcode::IsNull
        | Opcode::IsInvalid
        | Opcode::Uextend
--- a/cranelift/filetests/filetests/isa/aarch64/bswap.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/bswap.clif
@@ -0,0 +1,34 @@
+test compile precise-output
+set unwind_info=false
+target aarch64
+
+function %f0(i64) -> i64 {
+block0(v0: i64):
+    v1 = bswap v0
+    return v1
+}
+
+; block0:
+;   rev64 x0, x0
+;   ret
+
+function %f1(i32) -> i32 {
+block0(v0: i32):
+    v1 = bswap v0
+    return v1
+}
+
+; block0:
+;   rev32 w0, w0
+;   ret
+
+function %f2(i16) -> i16 {
+block0(v0: i16):
+    v1 = bswap v0
+    return v1
+}
+
+; block0:
+;   rev16 w0, w0
+;   ret
+
--- a/cranelift/filetests/filetests/isa/s390x/bswap.clif
+++ b/cranelift/filetests/filetests/isa/s390x/bswap.clif
@@ -0,0 +1,34 @@
+test compile precise-output
+target s390x
+
+function %bswap_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = bswap v0
+    return v1
+}
+
+; block0:
+;   lrvgr %r2, %r2
+;   br %r14
+
+function %bswap_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = bswap v0
+    return v1
+}
+
+; block0:
+;   lrvr %r2, %r2
+;   br %r14
+
+function %bswap_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = bswap v0
+    return v1
+}
+
+; block0:
+;   lrvr %r5, %r2
+;   srlk %r2, %r5, 16
+;   br %r14
+
--- a/cranelift/filetests/filetests/isa/x64/bswap.clif
+++ b/cranelift/filetests/filetests/isa/x64/bswap.clif
@@ -0,0 +1,48 @@
+test compile precise-output
+target x86_64
+
+function %f0(i64) -> i64 {
+block0(v0: i64):
+  v1 = bswap v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   bswapq  %rax, %rax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f1(i32) -> i32 {
+block0(v0: i32):
+  v1 = bswap v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   bswapl  %eax, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f2(i16) -> i16 {
+block0(v0: i16):
+  v1 = bswap v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movq    %rdi, %rax
+;   rolw    $8, %ax, %ax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
--- a/cranelift/filetests/filetests/runtests/bswap.clif
+++ b/cranelift/filetests/filetests/runtests/bswap.clif
@@ -0,0 +1,58 @@
+test interpret
+test run
+target x86_64
+target aarch64
+target s390x
+
+function %bswap_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = bswap v0
+    return v1
+}
+; run: %bswap_i16(0) == 0
+; run: %bswap_i16(1) == 0x0100
+; run: %bswap_i16(0x1234) == 0x3412
+; run: %bswap_i16(-2) == 0xFEFF
+
+function %bswap_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = bswap v0
+    return v1
+}
+; run: %bswap_i32(0) == 0
+; run: %bswap_i32(1) == 0x01000000
+; run: %bswap_i32(0x12345678) == 0x78563412
+; run: %bswap_i32(-2) == 0xFEFFFFFF
+
+function %bswap_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = bswap v0
+    return v1
+}
+; run: %bswap_i64(0) == 0
+; run: %bswap_i64(1) == 0x0100000000000000
+; run: %bswap_i64(0x123456789ABCDEF0) == 0xF0DEBC9A78563412
+; run: %bswap_i64(-2) == 0xFEFFFFFFFFFFFFFF
+
+function %fuzzer_case_0() -> i8, i32, i64 {
+block0:
+    v5 = iconst.i64 0x9903_5204_d05f_abab
+    v6 = bswap v5
+    v7 = iconst.i8 0
+    v8 = iconst.i32 0
+    return v7, v8, v6
+}
+
+; run: %fuzzer_case_0() == [0, 0, 0xabab_5fd0_0452_0399]
+
+function %fuzzer_case_1(f32, f64, i32, i32, f64) -> i8, i32, i64 {
+block0(v0: f32, v1: f64, v2: i32, v3: i32, v4: f64):
+    v5 = iconst.i64 0x9903_5204_d05f_abab
+    v6 = bswap v5
+    v7 = iconst.i8 0
+    v8 = iconst.i32 0
+    return v7, v8, v6
+}
+
+; run: %fuzzer_case_1(0.0, 0.0, 0, 0, 0.0) == [0, 0, 0xabab_5fd0_0452_0399]
+
--- a/cranelift/filetests/filetests/runtests/i128-bswap.clif
+++ b/cranelift/filetests/filetests/runtests/i128-bswap.clif
@@ -0,0 +1,12 @@
+test interpret
+
+function %bswap_i128(i128) -> i128 {
+block0(v0: i128):
+    v1 = bswap v0
+    return v1
+}
+; run: %bswap_i128(0) == 0
+; run: %bswap_i128(1) == 0x01000000_00000000_00000000_00000000
+; run: %bswap_i128(0x12345678_9ABCDEF0_CAFEF00D_F00DCAFE) == 0xFECA0DF0_0DF0FECA_F0DEBC9A_78563412
+; run: %bswap_i128(-2) == 0xFEFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF
+
--- a/cranelift/fuzzgen/src/function_generator.rs
+++ b/cranelift/fuzzgen/src/function_generator.rs
@@ -671,6 +671,13 @@ const OPCODE_SIGNATURES: &'static [(
    (Opcode::Bmask, &[I32], &[I128], insert_opcode),
    (Opcode::Bmask, &[I64], &[I128], insert_opcode),
    (Opcode::Bmask, &[I128], &[I128], insert_opcode),
+    // Bswap (unary: a single input operand, result of the same type)
+    (Opcode::Bswap, &[I16], &[I16], insert_opcode),
+    (Opcode::Bswap, &[I32], &[I32], insert_opcode),
+    (Opcode::Bswap, &[I64], &[I64], insert_opcode),
+    // I128 version not yet implemented on the x86_64 and aarch64 backends.
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    (Opcode::Bswap, &[I128], &[I128], insert_opcode),
    // Fadd
    (Opcode::Fadd, &[F32, F32], &[F32], insert_opcode),
    (Opcode::Fadd, &[F64, F64], &[F64], insert_opcode),
--- a/cranelift/interpreter/src/step.rs
+++ b/cranelift/interpreter/src/step.rs
@@ -806,6 +806,7 @@ where
        Opcode::UshrImm => binary_unsigned(Value::ushr, arg(0)?, imm_as_ctrl_ty()?)?,
        Opcode::SshrImm => binary(Value::ishr, arg(0)?, imm_as_ctrl_ty()?)?,
        Opcode::Bitrev => assign(Value::reverse_bits(arg(0)?)?),
+        Opcode::Bswap => assign(Value::swap_bytes(arg(0)?)?),
        Opcode::Clz => assign(arg(0)?.leading_zeros()?),
        Opcode::Cls => {
            let count = if Value::lt(&arg(0)?, &Value::int(0, ctrl_ty)?)? {
--- a/cranelift/interpreter/src/value.rs
+++ b/cranelift/interpreter/src/value.rs
@@ -86,6 +86,7 @@ pub trait Value: Clone + From<DataValue> {
    fn leading_zeros(self) -> ValueResult<Self>;
    fn trailing_zeros(self) -> ValueResult<Self>;
    fn reverse_bits(self) -> ValueResult<Self>;
+    fn swap_bytes(self) -> ValueResult<Self>;
 }

 #[derive(Error, Debug, PartialEq)]
@@ -716,4 +717,8 @@ impl Value for DataValue {
    fn reverse_bits(self) -> ValueResult<Self> {
        unary_match!(reverse_bits(&self); [I8, I16, I32, I64, I128, U8, U16, U32, U64, U128])
    }
+
+    fn swap_bytes(self) -> ValueResult<Self> {
+        // Dispatches to `swap_bytes` on the wrapped integer. Only multi-byte
+        // integer variants are listed; the 8-bit variants are left out, unlike
+        // in `reverse_bits` above. NOTE(review): what happens for an unlisted
+        // variant is decided inside `unary_match!` -- presumably an error.
+        unary_match!(swap_bytes(&self); [I16, I32, I64, I128, U16, U32, U64, U128])
+    }
 }