diff --git a/build.rs b/build.rs index 3841b85303..27ab619076 100644 --- a/build.rs +++ b/build.rs @@ -180,6 +180,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { }, "Cranelift" => match (testsuite, testname) { ("simd", "simd_store") => return false, + ("simd", "simd_i8x16_cmp") => return false, // Most simd tests are known to fail on aarch64 for now, it's going // to be a big chunk of work to implement them all there! ("simd", _) if target.contains("aarch64") => return true, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index c0cbdd1f25..4d257aee4f 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -342,6 +342,12 @@ fn enc_fround(top22: u32, rd: Writable, rn: Reg) -> u32 { (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) } +fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable, rn: Reg) -> u32 { + debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16); + let bits = 0b0_1_1_01110_00_10000_00000_10_00000_00000; + bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg()) +} + /// State carried between emissions of a sequence of instructions. 
#[derive(Default, Clone, Debug)] pub struct EmitState { @@ -1002,6 +1008,15 @@ impl MachInstEmit for Inst { }; sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); } + &Inst::VecMisc { op, rd, rn, ty } => { + let bits_12_16 = match op { + VecMisc2::Not => { + debug_assert_eq!(I8X16, ty); + 0b00101 + } + }; + sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn)); + } &Inst::FpuCmp32 { rn, rm } => { sink.put4(enc_fcmp(InstSize::Size32, rn, rm)); } @@ -1125,12 +1140,40 @@ impl MachInstEmit for Inst { | machreg_to_gpr(rd.to_reg()), ); } - &Inst::VecRRR { rd, rn, rm, alu_op } => { + &Inst::VecRRR { + rd, + rn, + rm, + alu_op, + ty, + } => { + let enc_size_for_cmp = match ty { + I8X16 => 0b00, + _ => 0, + }; + let (top11, bit15_10) = match alu_op { - VecALUOp::SQAddScalar => (0b010_11110_11_1, 0b000011), - VecALUOp::SQSubScalar => (0b010_11110_11_1, 0b001011), - VecALUOp::UQAddScalar => (0b011_11110_11_1, 0b000011), - VecALUOp::UQSubScalar => (0b011_11110_11_1, 0b001011), + VecALUOp::SQAddScalar => { + debug_assert_eq!(I64, ty); + (0b010_11110_11_1, 0b000011) + } + VecALUOp::SQSubScalar => { + debug_assert_eq!(I64, ty); + (0b010_11110_11_1, 0b001011) + } + VecALUOp::UQAddScalar => { + debug_assert_eq!(I64, ty); + (0b011_11110_11_1, 0b000011) + } + VecALUOp::UQSubScalar => { + debug_assert_eq!(I64, ty); + (0b011_11110_11_1, 0b001011) + } + VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b100011), + VecALUOp::Cmge => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001111), + VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101), + VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101), + VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111), }; sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd)); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 1dd6be20eb..8507100401 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ 
b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -1802,6 +1802,7 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::UQAddScalar, + ty: I64, }, "D50EF77E", "uqadd d21, d22, d23", @@ -1812,6 +1813,7 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::SQAddScalar, + ty: I64, }, "D50EF75E", "sqadd d21, d22, d23", @@ -1822,6 +1824,7 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::UQSubScalar, + ty: I64, }, "D52EF77E", "uqsub d21, d22, d23", @@ -1832,10 +1835,83 @@ fn test_aarch64_binemit() { rn: vreg(22), rm: vreg(23), alu_op: VecALUOp::SQSubScalar, + ty: I64, }, "D52EF75E", "sqsub d21, d22, d23", )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmeq, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + ty: I8X16, + }, + "E38E386E", + "cmeq v3.16b, v23.16b, v24.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmgt, + rd: writable_vreg(3), + rn: vreg(23), + rm: vreg(24), + ty: I8X16, + }, + "E336384E", + "cmgt v3.16b, v23.16b, v24.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmge, + rd: writable_vreg(23), + rn: vreg(9), + rm: vreg(12), + ty: I8X16, + }, + "373D2C4E", + "cmge v23.16b, v9.16b, v12.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhi, + rd: writable_vreg(5), + rn: vreg(1), + rm: vreg(1), + ty: I8X16, + }, + "2534216E", + "cmhi v5.16b, v1.16b, v1.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Cmhs, + rd: writable_vreg(8), + rn: vreg(2), + rm: vreg(15), + ty: I8X16, + }, + "483C2F6E", + "cmhs v8.16b, v2.16b, v15.16b", + )); + + insns.push(( + Inst::VecMisc { + op: VecMisc2::Not, + rd: writable_vreg(2), + rn: vreg(1), + ty: I8X16, + }, + "2258206E", + "mvn v2.16b, v1.16b", + )); + insns.push(( Inst::Extend { rd: writable_xreg(1), diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 1cf307d1d0..7818092565 100644 --- 
a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -5,7 +5,7 @@ use crate::binemit::CodeOffset; use crate::ir::types::{ - B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS, + B1, B16, B32, B64, B8, B8X16, F32, F32X2, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS, }; use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type}; use crate::machinst::*; @@ -197,6 +197,23 @@ pub enum VecALUOp { SQSubScalar, /// Unsigned saturating subtract UQSubScalar, + /// Compare bitwise equal + Cmeq, + /// Compare signed greater than or equal + Cmge, + /// Compare signed greater than + Cmgt, + /// Compare unsigned higher or same + Cmhs, + /// Compare unsigned higher + Cmhi, +} + +/// A Vector miscellaneous operation with two registers. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecMisc2 { + /// Bitwise NOT. + Not, } /// An operation on the bits of a register. This can be paired with several instruction formats @@ -626,6 +643,15 @@ pub enum Inst { rd: Writable<Reg>, rn: Reg, rm: Reg, + ty: Type, + }, + + /// Vector two register miscellaneous instruction. + VecMisc { + op: VecMisc2, + rd: Writable<Reg>, + rn: Reg, + ty: Type, }, /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn). @@ -1096,6 +1122,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_use(rm); collector.add_use(ra); } + &Inst::VecMisc { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => { collector.add_use(rn); collector.add_use(rm); } @@ -1567,6 +1597,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { map_use(mapper, rm); map_use(mapper, ra); } + &mut Inst::VecMisc { + ref mut rd, + ref mut rn, + ..
+ } => { + map_def(mapper, rd); + map_use(mapper, rn); + } &mut Inst::FpuCmp32 { ref mut rn, ref mut rm, @@ -1909,6 +1947,7 @@ impl MachInst for Inst { F32 | F64 => Ok(RegClass::V128), IFLAGS | FFLAGS => Ok(RegClass::I64), I8X16 => Ok(RegClass::V128), + B8X16 => Ok(RegClass::V128), _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", ty @@ -2482,18 +2521,45 @@ impl ShowWithRRU for Inst { let rn = rn.show_rru(mb_rru); format!("mov {}, {}.d[0]", rd, rn) } - &Inst::VecRRR { rd, rn, rm, alu_op } => { - let op = match alu_op { - VecALUOp::SQAddScalar => "sqadd", - VecALUOp::UQAddScalar => "uqadd", - VecALUOp::SQSubScalar => "sqsub", - VecALUOp::UQSubScalar => "uqsub", + &Inst::VecRRR { + rd, + rn, + rm, + alu_op, + ty, + } => { + let (op, vector) = match alu_op { + VecALUOp::SQAddScalar => ("sqadd", false), + VecALUOp::UQAddScalar => ("uqadd", false), + VecALUOp::SQSubScalar => ("sqsub", false), + VecALUOp::UQSubScalar => ("uqsub", false), + VecALUOp::Cmeq => ("cmeq", true), + VecALUOp::Cmge => ("cmge", true), + VecALUOp::Cmgt => ("cmgt", true), + VecALUOp::Cmhs => ("cmhs", true), + VecALUOp::Cmhi => ("cmhi", true), }; - let rd = show_vreg_scalar(rd.to_reg(), mb_rru); - let rn = show_vreg_scalar(rn, mb_rru); - let rm = show_vreg_scalar(rm, mb_rru); + + let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector { + |reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty) + } else { + |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru) + }; + + let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty); + let rn = show_vreg_fn(rn, mb_rru, ty); + let rm = show_vreg_fn(rm, mb_rru, ty); format!("{} {}, {}, {}", op, rd, rn, rm) } + &Inst::VecMisc { op, rd, rn, ty } => { + let op = match op { + VecMisc2::Not => "mvn", + }; + + let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty); + let rn = show_vreg_vector(rn, mb_rru, ty); + format!("{} {}, {}", op, rd, rn) + } &Inst::MovToNZCV { rn } => { let rn = rn.show_rru(mb_rru); format!("msr nzcv, {}", rn) diff 
--git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs index 7e13e33ac8..cebcf6ec30 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -319,6 +319,7 @@ pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> let mut s = reg.show_rru(mb_rru); match ty { + I8X16 => s.push_str(".16b"), F32X2 => s.push_str(".2s"), _ => unimplemented!(), } diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 68ad4017e1..10db3b1f07 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -277,6 +277,7 @@ pub(crate) fn input_to_reg>( tmp.to_reg() } (_, 64) => in_reg, + (_, 128) => in_reg, _ => panic!( "Unsupported input width: input ty {} bits {} mode {:?}", @@ -712,7 +713,7 @@ pub fn ty_bits(ty: Type) -> usize { B64 | I64 | F64 => 64, B128 | I128 => 128, IFLAGS | FFLAGS => 32, - I8X16 => 128, + I8X16 | B8X16 => 128, _ => panic!("ty_bits() on unknown type: {:?}", ty), } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 2faa66941f..95bf050958 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -7,7 +7,7 @@ use crate::ir::Inst as IRInst; use crate::ir::{InstructionData, Opcode, TrapCode}; use crate::machinst::lower::*; use crate::machinst::*; -use crate::CodegenResult; +use crate::{CodegenError, CodegenResult}; use crate::isa::aarch64::abi::*; use crate::isa::aarch64::inst::*; @@ -96,6 +96,7 @@ pub(crate) fn lower_insn_to_regs>( rn: va.to_reg(), rm: vb.to_reg(), alu_op, + ty: I64, }); ctx.emit(Inst::MovFromVec64 { rd, @@ -127,6 +128,7 @@ pub(crate) fn lower_insn_to_regs>( rn: va.to_reg(), rm: vb.to_reg(), alu_op, + ty: I64, }); ctx.emit(Inst::MovFromVec64 { rd, @@ -1152,12 +1154,66 @@ pub(crate) fn lower_insn_to_regs>( 
(false, true) => NarrowValueMode::SignExtend64, (false, false) => NarrowValueMode::ZeroExtend64, }; - let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); - let rn = input_to_reg(ctx, inputs[0], narrow_mode); - let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); - let rd = output_to_reg(ctx, outputs[0]); - ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); - ctx.emit(Inst::CondSet { cond, rd }); + + if ty_bits(ty) < 128 { + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); + ctx.emit(Inst::CondSet { cond, rd }); + } else { + if ty != I8X16 { + return Err(CodegenError::Unsupported(format!( + "unsupported simd type: {:?}", + ty + ))); + } + + let mut rn = input_to_reg(ctx, inputs[0], narrow_mode); + let mut rm = input_to_reg(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + + // 'Less than' operations are implemented by swapping + // the order of operands and using the 'greater than' + // instructions. + // 'Not equal' is implemented with 'equal' and inverting + // the result. 
+ let (alu_op, swap) = match cond { + Cond::Eq => (VecALUOp::Cmeq, false), + Cond::Ne => (VecALUOp::Cmeq, false), + Cond::Ge => (VecALUOp::Cmge, false), + Cond::Gt => (VecALUOp::Cmgt, false), + Cond::Le => (VecALUOp::Cmge, true), + Cond::Lt => (VecALUOp::Cmgt, true), + Cond::Hs => (VecALUOp::Cmhs, false), + Cond::Hi => (VecALUOp::Cmhi, false), + Cond::Ls => (VecALUOp::Cmhs, true), + Cond::Lo => (VecALUOp::Cmhi, true), + _ => unreachable!(), + }; + + if swap { + std::mem::swap(&mut rn, &mut rm); + } + + ctx.emit(Inst::VecRRR { + alu_op, + rd, + rn, + rm, + ty, + }); + + if cond == Cond::Ne { + ctx.emit(Inst::VecMisc { + op: VecMisc2::Not, + rd, + rn: rd.to_reg(), + ty: I8X16, + }); + } + } } Opcode::Fcmp => { @@ -1350,6 +1406,13 @@ pub(crate) fn lower_insn_to_regs>( lower_constant_f128(ctx, rd, value); } + Opcode::RawBitcast => { + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rm, ty)); + } + Opcode::Shuffle | Opcode::Vsplit | Opcode::Vconcat @@ -1359,7 +1422,6 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Splat | Opcode::Insertlane | Opcode::Extractlane - | Opcode::RawBitcast | Opcode::ScalarToVector | Opcode::Swizzle | Opcode::Uload8x8