diff --git a/build.rs b/build.rs
index a6e2b7d8bc..28b1ca3636 100644
--- a/build.rs
+++ b/build.rs
@@ -179,7 +179,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             _ => (),
         },
         "Cranelift" => match (testsuite, testname) {
-            // All simd tests are known to fail on aarch64 for now, it's going
+            ("simd", "simd_store") => return false,
+            // Most simd tests are known to fail on aarch64 for now, it's going
             // to be a big chunk of work to implement them all there!
             ("simd", _) if target.contains("aarch64") => return true,
 
diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 8f388665b5..23b824fdb7 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -280,7 +280,7 @@ fn in_int_reg(ty: ir::Type) -> bool {
 
 fn in_vec_reg(ty: ir::Type) -> bool {
     match ty {
-        types::F32 | types::F64 => true,
+        types::F32 | types::F64 | types::I8X16 => true,
         _ => false,
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 4b8142fbe5..3648eba2d0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -480,11 +480,14 @@ impl ShowWithRRU for BranchTarget {
 }
 
 /// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and
-/// 64-bit variants of many instructions (and integer registers).
+/// 64-bit variants of many instructions (and integer and floating-point registers) and 128-bit
+/// variants of vector instructions.
+/// TODO: Create a separate type for SIMD & floating-point operands.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum InstSize {
     Size32,
     Size64,
+    Size128,
 }
 
 impl InstSize {
@@ -507,11 +510,13 @@ impl InstSize {
     /// Convert from a needed width to the smallest size that fits.
     pub fn from_bits<I: Into<usize>>(bits: I) -> InstSize {
         let bits: usize = bits.into();
-        assert!(bits <= 64);
+        assert!(bits <= 128);
         if bits <= 32 {
             InstSize::Size32
-        } else {
+        } else if bits <= 64 {
             InstSize::Size64
+        } else {
+            InstSize::Size128
         }
     }
 
@@ -520,11 +525,12 @@ impl InstSize {
         Self::from_bits(ty_bits(ty))
     }
 
-    /// Convert to I32 or I64.
+    /// Convert to I32, I64, or I128.
     pub fn to_ty(self) -> Type {
         match self {
             InstSize::Size32 => I32,
             InstSize::Size64 => I64,
+            InstSize::Size128 => I128,
         }
     }
 
@@ -532,6 +538,9 @@ impl InstSize {
         match self {
             InstSize::Size32 => 0,
             InstSize::Size64 => 1,
+            _ => {
+                panic!("Unexpected size");
+            }
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 1bf59814ba..c0cbdd1f25 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -295,8 +295,8 @@ fn enc_ccmp_imm(size: InstSize, rn: Reg, imm: UImm5, nzcv: NZCV, cond: Cond) ->
 }
 
 fn enc_vecmov(is_16b: bool, rd: Writable<Reg>, rn: Reg) -> u32 {
-    debug_assert!(!is_16b); // to be supported later.
     0b00001110_101_00000_00011_1_00000_00000
+        | ((is_16b as u32) << 30)
         | machreg_to_vec(rd.to_reg())
         | (machreg_to_vec(rn) << 16)
         | (machreg_to_vec(rn) << 5)
@@ -918,6 +918,9 @@ impl MachInstEmit for Inst {
             &Inst::FpuMove64 { rd, rn } => {
                 sink.put4(enc_vecmov(/* 16b = */ false, rd, rn));
             }
+            &Inst::FpuMove128 { rd, rn } => {
+                sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
+            }
             &Inst::FpuRR { fpu_op, rd, rn } => {
                 let top22 = match fpu_op {
                     FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000,
@@ -1073,6 +1076,22 @@ impl MachInstEmit for Inst {
                 inst.emit(sink, flags, state);
                 sink.put8(const_data.to_bits());
             }
+            &Inst::LoadFpuConst128 { rd, const_data } => {
+                let inst = Inst::FpuLoad128 {
+                    rd,
+                    mem: MemArg::Label(MemLabel::PCRel(8)),
+                    srcloc: None,
+                };
+                inst.emit(sink, flags, state);
+                let inst = Inst::Jump {
+                    dest: BranchTarget::ResolvedOffset(20),
+                };
+                inst.emit(sink, flags, state);
+
+                for i in const_data.to_le_bytes().iter() {
+                    sink.put1(*i);
+                }
+            }
             &Inst::FpuCSel32 { rd, rn, rm, cond } => {
                 sink.put4(enc_fcsel(rd, rn, rm, cond, InstSize::Size32));
             }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index b948f4fd8c..1dd6be20eb 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2164,6 +2164,15 @@ fn test_aarch64_binemit() {
         "mov v8.8b, v4.8b",
     ));
 
+    insns.push((
+        Inst::FpuMove128 {
+            rd: writable_vreg(17),
+            rn: vreg(26),
+        },
+        "511FBA4E",
+        "mov v17.16b, v26.16b",
+    ));
+
     insns.push((
         Inst::FpuRR {
             fpu_op: FPUOp1::Abs32,
@@ -2726,6 +2735,15 @@ fn test_aarch64_binemit() {
         "ldr d16, pc+8 ; b 12 ; data.f64 1",
     ));
 
+    insns.push((
+        Inst::LoadFpuConst128 {
+            rd: writable_vreg(5),
+            const_data: 0x0f0e0d0c0b0a09080706050403020100,
+        },
+        "4500009C05000014000102030405060708090A0B0C0D0E0F",
+        "ldr q5, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100",
+    ));
+
     insns.push((
         Inst::FpuCSel32 {
             rd: writable_vreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 989ce96f6c..1cf307d1d0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -4,7 +4,9 @@
 #![allow(dead_code)]
 
 use crate::binemit::CodeOffset;
-use crate::ir::types::{B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I16, I32, I64, I8, IFLAGS};
+use crate::ir::types::{
+    B1, B16, B32, B64, B8, F32, F32X2, F64, FFLAGS, I128, I16, I32, I64, I8, I8X16, IFLAGS,
+};
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::machinst::*;
 use crate::{settings, CodegenError, CodegenResult};
@@ -470,6 +472,12 @@ pub enum Inst {
         rn: Reg,
     },
 
+    /// Vector register move.
+    FpuMove128 {
+        rd: Writable<Reg>,
+        rn: Reg,
+    },
+
     /// 1-op FPU instruction.
     FpuRR {
         fpu_op: FPUOp1,
@@ -559,6 +567,11 @@ pub enum Inst {
         const_data: f64,
     },
 
+    LoadFpuConst128 {
+        rd: Writable<Reg>,
+        const_data: u128,
+    },
+
     /// Conversion: FP -> integer.
     FpuToInt {
         op: FpuToIntOp,
@@ -816,6 +829,11 @@ impl Inst {
                 rd: to_reg,
                 rm: from_reg,
             }
+        } else if from_reg.get_class() == RegClass::V128 {
+            Inst::FpuMove128 {
+                rd: to_reg,
+                rn: from_reg,
+            }
         } else {
             Inst::FpuMove64 {
                 rd: to_reg,
@@ -905,6 +923,14 @@ impl Inst {
             const_data: value,
         }
     }
+
+    /// Create an instruction that loads a 128-bit vector constant.
+    pub fn load_fp_constant128(rd: Writable<Reg>, value: u128) -> Inst {
+        Inst::LoadFpuConst128 {
+            rd,
+            const_data: value,
+        }
+    }
 }
 
 //=============================================================================
@@ -1044,6 +1070,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
+        &Inst::FpuMove128 { rd, rn } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuRR { rd, rn, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
@@ -1094,7 +1124,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_use(rd);
             memarg_regs(mem, collector);
         }
-        &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => {
+        &Inst::LoadFpuConst32 { rd, .. }
+        | &Inst::LoadFpuConst64 { rd, .. }
+        | &Inst::LoadFpuConst128 { rd, .. } => {
             collector.add_def(rd);
         }
         &Inst::FpuToInt { rd, rn, .. } => {
@@ -1490,6 +1522,13 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_def(mapper, rd);
             map_use(mapper, rn);
         }
+        &mut Inst::FpuMove128 {
+            ref mut rd,
+            ref mut rn,
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuRR {
             ref mut rd,
             ref mut rn,
@@ -1596,6 +1635,9 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
         &mut Inst::LoadFpuConst64 { ref mut rd, .. } => {
             map_def(mapper, rd);
         }
+        &mut Inst::LoadFpuConst128 { ref mut rd, .. } => {
+            map_def(mapper, rd);
+        }
         &mut Inst::FpuToInt {
             ref mut rd,
             ref mut rn,
@@ -1780,6 +1822,7 @@ impl MachInst for Inst {
         match self {
             &Inst::Mov { rd, rm } => Some((rd, rm)),
             &Inst::FpuMove64 { rd, rn } => Some((rd, rn)),
+            &Inst::FpuMove128 { rd, rn } => Some((rd, rn)),
             _ => None,
         }
     }
@@ -1813,7 +1856,7 @@ impl MachInst for Inst {
     }
 
     fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
-        assert!(ty.bits() <= 64); // no vector support yet!
+        assert!(ty.bits() <= 128);
         Inst::mov(to_reg, from_reg)
     }
 
@@ -1865,6 +1908,7 @@ impl MachInst for Inst {
             I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => Ok(RegClass::I64),
             F32 | F64 => Ok(RegClass::V128),
             IFLAGS | FFLAGS => Ok(RegClass::I64),
+            I8X16 => Ok(RegClass::V128),
             _ => Err(CodegenError::Unsupported(format!(
                 "Unexpected SSA-value type: {}",
                 ty
@@ -2235,6 +2279,11 @@ impl ShowWithRRU for Inst {
                 let rn = rn.show_rru(mb_rru);
                 format!("mov {}.8b, {}.8b", rd, rn)
            }
+            &Inst::FpuMove128 { rd, rn } => {
+                let rd = rd.to_reg().show_rru(mb_rru);
+                let rn = rn.show_rru(mb_rru);
+                format!("mov {}.16b, {}.16b", rd, rn)
+            }
             &Inst::FpuRR { fpu_op, rd, rn } => {
                 let (op, sizesrc, sizedest) = match fpu_op {
                     FPUOp1::Abs32 => ("fabs", InstSize::Size32, InstSize::Size32),
@@ -2360,6 +2409,10 @@ impl ShowWithRRU for Inst {
                 let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64);
                 format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, const_data)
             }
+            &Inst::LoadFpuConst128 { rd, const_data } => {
+                let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size128);
+                format!("ldr {}, pc+8 ; b 20 ; data.f128 0x{:032x}", rd, const_data)
+            }
             &Inst::FpuToInt { op, rd, rn } => {
                 let (op, sizesrc, sizedest) = match op {
                     FpuToIntOp::F32ToI32 => ("fcvtzs", InstSize::Size32, InstSize::Size32),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 242fb66fc9..7e13e33ac8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -276,13 +276,17 @@ pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSiz
     s
 }
 
-/// Show a vector register when its use as a 32-bit or 64-bit float is known.
+/// Show a vector register.
 pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String {
     let mut s = reg.show_rru(mb_rru);
     if reg.get_class() != RegClass::V128 {
         return s;
     }
-    let prefix = if size.is32() { "s" } else { "d" };
+    let prefix = match size {
+        InstSize::Size32 => "s",
+        InstSize::Size64 => "d",
+        InstSize::Size128 => "q",
+    };
     s.replace_range(0..1, prefix);
     s
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index eb4aafd551..68ad4017e1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -142,6 +142,31 @@ pub(crate) fn input_to_shiftimm<C: LowerCtx<I = Inst>>(
     input_to_const(ctx, input).and_then(ShiftOpShiftImm::maybe_from_shift)
 }
 
+pub(crate) fn output_to_const_f128<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    out: InsnOutput,
+) -> Option<u128> {
+    if out.output > 0 {
+        None
+    } else {
+        let inst_data = ctx.data(out.insn);
+
+        match inst_data {
+            &InstructionData::UnaryConst {
+                opcode: _,
+                constant_handle,
+            } => {
+                let mut bytes = [0u8; 16];
+                let c = ctx.get_constant_data(constant_handle).clone().into_vec();
+                assert_eq!(c.len(), 16);
+                bytes.copy_from_slice(&c);
+                Some(u128::from_le_bytes(bytes))
+            }
+            _ => None,
+        }
+    }
+}
+
 /// How to handle narrow values loaded into registers; see note on `narrow_mode`
 /// parameter to `input_to_*` below.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -588,6 +613,14 @@ pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
     ctx.emit(Inst::load_fp_constant64(rd, value));
 }
 
+pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rd: Writable<Reg>,
+    value: u128,
+) {
+    ctx.emit(Inst::load_fp_constant128(rd, value));
+}
+
 pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
     match cc {
         IntCC::Equal => Cond::Eq,
@@ -679,6 +712,7 @@ pub fn ty_bits(ty: Type) -> usize {
         B64 | I64 | F64 => 64,
         B128 | I128 => 128,
         IFLAGS | FFLAGS => 32,
+        I8X16 => 128,
         _ => panic!("ty_bits() on unknown type: {:?}", ty),
     }
 }
@@ -686,7 +720,7 @@ pub fn ty_bits(ty: Type) -> usize {
 pub(crate) fn ty_is_int(ty: Type) -> bool {
     match ty {
         B1 | B8 | I8 | B16 | I16 | B32 | I32 | B64 | I64 => true,
-        F32 | F64 | B128 | I128 => false,
+        F32 | F64 | B128 | I128 | I8X16 => false,
         IFLAGS | FFLAGS => panic!("Unexpected flags type"),
         _ => panic!("ty_is_int() on unknown type: {:?}", ty),
     }
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 2946e16471..2faa66941f 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -875,6 +875,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 (32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc },
                 (64, _, false) => Inst::ULoad64 { rd, mem, srcloc },
                 (64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc },
+                (128, _, _) => Inst::FpuLoad128 { rd, mem, srcloc },
                 _ => panic!("Unsupported size in load"),
             });
         }
@@ -914,6 +915,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 (32, true) => Inst::FpuStore32 { rd, mem, srcloc },
                 (64, false) => Inst::Store64 { rd, mem, srcloc },
                 (64, true) => Inst::FpuStore64 { rd, mem, srcloc },
+                (128, _) => Inst::FpuStore128 { rd, mem, srcloc },
                 _ => panic!("Unsupported size in store"),
             });
         }
@@ -1342,8 +1344,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             panic!("Branch opcode reached non-branch lowering logic!");
         }
 
-        Opcode::Vconst
-        | Opcode::Shuffle
+        Opcode::Vconst => {
+            let value = output_to_const_f128(ctx, outputs[0]).unwrap();
+            let rd = output_to_reg(ctx, outputs[0]);
+            lower_constant_f128(ctx, rd, value);
+        }
+
+        Opcode::Shuffle
         | Opcode::Vsplit
         | Opcode::Vconcat
         | Opcode::Vselect
diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs
index 76663450ba..fabfdecc6a 100644
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -7,8 +7,8 @@ use crate::fx::{FxHashMap, FxHashSet};
 use crate::inst_predicates::{has_side_effect_or_load, is_constant_64bit};
 use crate::ir::instructions::BranchInfo;
 use crate::ir::{
-    ArgumentExtension, Block, ExternalName, Function, GlobalValueData, Inst, InstructionData,
-    MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef,
+    ArgumentExtension, Block, Constant, ConstantData, ExternalName, Function, GlobalValueData,
+    Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef,
 };
 use crate::machinst::{
     ABIBody, BlockIndex, BlockLoweringOrder, LoweredBlock, MachLabel, VCode, VCodeBuilder,
@@ -145,6 +145,8 @@ pub trait LowerCtx {
     /// `get_input()`. Codegen may not happen otherwise for the producing
     /// instruction if it has no side effects and no uses.
     fn use_input_reg(&mut self, input: LowerInput);
+    /// Retrieve constant data given a handle.
+    fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData;
 }
 
 /// A representation of all of the ways in which an instruction input is
@@ -913,6 +915,10 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
         debug!("use_input_reg: vreg {:?} is needed", input.reg);
         self.vreg_needed[input.reg.get_index()] = true;
     }
+
+    fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData {
+        self.f.dfg.constants.get(constant_handle)
+    }
 }
 
 /// Visit all successors of a block with a given visitor closure.
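
Illustration (not part of the patch): a self-contained Rust sketch of how the widened InstSize::from_bits() conversion in cranelift/codegen/src/isa/aarch64/inst/args.rs above is expected to behave once Size128 exists. The enum here is a simplified stand-in for the real type, which also carries register-naming and encoding helpers.

// Standalone sketch (not the real Cranelift type): mirrors the InstSize
// variants and the widened from_bits() logic from the patch above.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum InstSize {
    Size32,
    Size64,
    Size128,
}

impl InstSize {
    /// Convert from a needed width to the smallest size that fits.
    fn from_bits<I: Into<usize>>(bits: I) -> InstSize {
        let bits: usize = bits.into();
        assert!(bits <= 128);
        if bits <= 32 {
            InstSize::Size32
        } else if bits <= 64 {
            InstSize::Size64
        } else {
            InstSize::Size128
        }
    }
}

fn main() {
    // 1..=32 bits pick the 32-bit operand size, 33..=64 the 64-bit one, and
    // anything wider (e.g. an I8X16 vector at 128 bits) now maps to Size128.
    assert_eq!(InstSize::from_bits(1usize), InstSize::Size32);
    assert_eq!(InstSize::from_bits(64usize), InstSize::Size64);
    assert_eq!(InstSize::from_bits(128usize), InstSize::Size128);
}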