Merge branch 'main' into peepmatic-bnot

2020-07-16 16:13:28 +03:00
parent 657aea5286 a9455a8e51
commit 4564c396d2
49 changed files with 861 additions and 491 deletions
--- a/build.rs
+++ b/build.rs
@@ -202,8 +202,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
            // to be a big chunk of work to implement them all there!
            ("simd", _) if target.contains("aarch64") => return true,

-            ("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
-
            // TODO(#1886): Ignore reference types tests if this isn't x64,
            // because Cranelift only supports reference types on x64.
            ("reference_types", _) => {
--- a/cranelift/Cargo.toml
+++ b/cranelift/Cargo.toml
@@ -48,3 +48,4 @@ default = ["disas", "wasm", "cranelift-codegen/all-arch"]
 disas = ["capstone"]
 enable-peepmatic = ["cranelift-codegen/enable-peepmatic", "cranelift-filetests/enable-peepmatic"]
 wasm = ["wat", "cranelift-wasm"]
+experimental_x64 = ["cranelift-codegen/x64"]
--- a/cranelift/codegen/Cargo.toml
+++ b/cranelift/codegen/Cargo.toml
@@ -66,7 +66,6 @@ x64 = [] # New work-in-progress codegen backend for x86_64 based on the new isel
 # Option to enable all architectures.
 all-arch = [
    "x86",
-    "x64",
    "arm32",
    "arm64",
    "riscv"
--- a/cranelift/codegen/build.rs
+++ b/cranelift/codegen/build.rs
@@ -26,7 +26,15 @@ fn main() {
    let out_dir = env::var("OUT_DIR").expect("The OUT_DIR environment variable must be set");
    let target_triple = env::var("TARGET").expect("The TARGET environment variable must be set");

-    // Configure isa targets cfg.
+    let new_backend_isas = if env::var("CARGO_FEATURE_X64").is_ok() {
+        // The x64 (new backend for x86_64) is a bit particular: it only requires generating
+        // the shared meta code; the only ISA-specific code is for settings.
+        vec![meta::isa::Isa::X86]
+    } else {
+        Vec::new()
+    };
+
+    // Configure isa targets using the old backend.
    let isa_targets = meta::isa::Isa::all()
        .iter()
        .cloned()
@@ -36,7 +44,7 @@ fn main() {
        })
        .collect::<Vec<_>>();

-    let isas = if isa_targets.is_empty() {
+    let old_backend_isas = if new_backend_isas.is_empty() && isa_targets.is_empty() {
        // Try to match native target.
        let target_name = target_triple.split('-').next().unwrap();
        let isa = meta::isa_from_arch(&target_name).expect("error when identifying target");
@@ -56,14 +64,23 @@ fn main() {
        crate_dir.join("build.rs").to_str().unwrap()
    );

-    if let Err(err) = meta::generate(&isas, &out_dir) {
+    if let Err(err) = meta::generate(&old_backend_isas, &new_backend_isas, &out_dir) {
        eprintln!("Error: {}", err);
        process::exit(1);
    }

    if env::var("CRANELIFT_VERBOSE").is_ok() {
-        for isa in &isas {
-            println!("cargo:warning=Includes support for {} ISA", isa.to_string());
+        for isa in &old_backend_isas {
+            println!(
+                "cargo:warning=Includes old-backend support for {} ISA",
+                isa.to_string()
+            );
+        }
+        for isa in &new_backend_isas {
+            println!(
+                "cargo:warning=Includes new-backend support for {} ISA",
+                isa.to_string()
+            );
        }
        println!(
            "cargo:warning=Build step took {:?}.",
--- a/cranelift/codegen/meta/src/cdsl/typevar.rs
+++ b/cranelift/codegen/meta/src/cdsl/typevar.rs
@@ -211,6 +211,24 @@ impl TypeVar {
                    "can't double 256 lanes"
                );
            }
+            DerivedFunc::MergeLanes => {
+                assert!(
+                    ts.ints.is_empty() || *ts.ints.iter().max().unwrap() < MAX_BITS,
+                    "can't double all integer types"
+                );
+                assert!(
+                    ts.floats.is_empty() || *ts.floats.iter().max().unwrap() < MAX_FLOAT_BITS,
+                    "can't double all float types"
+                );
+                assert!(
+                    ts.bools.is_empty() || *ts.bools.iter().max().unwrap() < MAX_BITS,
+                    "can't double all boolean types"
+                );
+                assert!(
+                    *ts.lanes.iter().min().unwrap() > 1,
+                    "can't halve a scalar type"
+                );
+            }
            DerivedFunc::LaneOf | DerivedFunc::AsBool => { /* no particular assertions */ }
        }

@@ -248,6 +266,9 @@ impl TypeVar {
    pub fn split_lanes(&self) -> TypeVar {
        self.derived(DerivedFunc::SplitLanes)
    }
+    pub fn merge_lanes(&self) -> TypeVar {
+        self.derived(DerivedFunc::MergeLanes)
+    }

    /// Constrain the range of types this variable can assume to a subset of those in the typeset
    /// ts.
@@ -355,6 +376,7 @@ pub(crate) enum DerivedFunc {
    HalfVector,
    DoubleVector,
    SplitLanes,
+    MergeLanes,
 }

 impl DerivedFunc {
@@ -367,6 +389,7 @@ impl DerivedFunc {
            DerivedFunc::HalfVector => "half_vector",
            DerivedFunc::DoubleVector => "double_vector",
            DerivedFunc::SplitLanes => "split_lanes",
+            DerivedFunc::MergeLanes => "merge_lanes",
        }
    }

@@ -377,6 +400,8 @@ impl DerivedFunc {
            DerivedFunc::DoubleWidth => Some(DerivedFunc::HalfWidth),
            DerivedFunc::HalfVector => Some(DerivedFunc::DoubleVector),
            DerivedFunc::DoubleVector => Some(DerivedFunc::HalfVector),
+            DerivedFunc::MergeLanes => Some(DerivedFunc::SplitLanes),
+            DerivedFunc::SplitLanes => Some(DerivedFunc::MergeLanes),
            _ => None,
        }
    }
@@ -462,6 +487,7 @@ impl TypeSet {
            DerivedFunc::HalfVector => self.half_vector(),
            DerivedFunc::DoubleVector => self.double_vector(),
            DerivedFunc::SplitLanes => self.half_width().double_vector(),
+            DerivedFunc::MergeLanes => self.double_width().half_vector(),
        }
    }

@@ -601,7 +627,8 @@ impl TypeSet {
            DerivedFunc::DoubleWidth => self.half_width(),
            DerivedFunc::HalfVector => self.double_vector(),
            DerivedFunc::DoubleVector => self.half_vector(),
-            DerivedFunc::SplitLanes => self.half_vector().double_width(),
+            DerivedFunc::SplitLanes => self.double_width().half_vector(),
+            DerivedFunc::MergeLanes => self.half_width().double_vector(),
        }
    }

--- a/cranelift/codegen/meta/src/gen_legalizer.rs
+++ b/cranelift/codegen/meta/src/gen_legalizer.rs
@@ -700,6 +700,7 @@ fn gen_isa(
 pub(crate) fn generate(
    isas: &[TargetIsa],
    transform_groups: &TransformGroups,
+    extra_legalization_groups: &[&'static str],
    filename_prefix: &str,
    out_dir: &str,
 ) -> Result<(), error::Error> {
@@ -711,8 +712,14 @@ pub(crate) fn generate(
        fmt.update_file(format!("{}-{}.rs", filename_prefix, isa.name), out_dir)?;
    }

+    // Add extra legalization groups that were explicitly requested.
+    for group in extra_legalization_groups {
+        shared_group_names.insert(group);
+    }
+
    // Generate shared legalize groups.
    let mut fmt = Formatter::new();
+    // Prepare the type-set table and the sorted list of shared group names.
    let mut type_sets = UniqueTable::new();
    let mut sorted_shared_group_names = Vec::from_iter(shared_group_names);
    sorted_shared_group_names.sort();
--- a/cranelift/codegen/meta/src/isa/mod.rs
+++ b/cranelift/codegen/meta/src/isa/mod.rs
@@ -6,10 +6,10 @@ use std::fmt;
 mod arm32;
 mod arm64;
 mod riscv;
-mod x86;
+pub(crate) mod x86;

 /// Represents known ISA target.
-#[derive(Copy, Clone)]
+#[derive(PartialEq, Copy, Clone)]
 pub enum Isa {
    Riscv,
    X86,
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1669,6 +1669,7 @@ fn define_simd(
    let ssub_sat = shared.by_name("ssub_sat");
    let store = shared.by_name("store");
    let store_complex = shared.by_name("store_complex");
+    let swiden_low = shared.by_name("swiden_low");
    let uadd_sat = shared.by_name("uadd_sat");
    let uload8x8 = shared.by_name("uload8x8");
    let uload8x8_complex = shared.by_name("uload8x8_complex");
@@ -1678,6 +1679,7 @@ fn define_simd(
    let uload32x2_complex = shared.by_name("uload32x2_complex");
    let snarrow = shared.by_name("snarrow");
    let unarrow = shared.by_name("unarrow");
+    let uwiden_low = shared.by_name("uwiden_low");
    let ushr_imm = shared.by_name("ushr_imm");
    let usub_sat = shared.by_name("usub_sat");
    let vconst = shared.by_name("vconst");
@@ -1697,6 +1699,7 @@ fn define_simd(
    let x86_pminu = x86.by_name("x86_pminu");
    let x86_pmullq = x86.by_name("x86_pmullq");
    let x86_pmuludq = x86.by_name("x86_pmuludq");
+    let x86_palignr = x86.by_name("x86_palignr");
    let x86_pshufb = x86.by_name("x86_pshufb");
    let x86_pshufd = x86.by_name("x86_pshufd");
    let x86_psll = x86.by_name("x86_psll");
@@ -1901,6 +1904,8 @@ fn define_simd(
            rec_fa.opcodes(low),
        );
    }
+
+    // SIMD narrow/widen
    for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
        let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
@@ -1912,6 +1917,23 @@ fn define_simd(
        let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
    }
+    for (ty, swiden_opcode, uwiden_opcode) in &[
+        (I8, &PMOVSXBW[..], &PMOVZXBW[..]),
+        (I16, &PMOVSXWD[..], &PMOVZXWD[..]),
+    ] {
+        let isap = Some(use_sse41_simd);
+        let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap);
+        let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap);
+    }
+    for ty in &[I8, I16, I32, I64] {
+        e.enc_both_inferred_maybe_isap(
+            x86_palignr.bind(vector(*ty, sse_vector_size)),
+            rec_fa_ib.opcodes(&PALIGNR[..]),
+            Some(use_ssse3_simd),
+        );
+    }

    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
    for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -664,6 +664,21 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    let c = &Operand::new("c", uimm8)
+        .with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details");
+    ig.push(
+        Inst::new(
+            "x86_palignr",
+            r#"
+        Concatenate destination and source operands, extracting a byte-aligned result shifted to 
+        the right by `c`.
+        "#,
+            &formats.ternary_imm8,
+        )
+        .operands_in(vec![x, y, c])
+        .operands_out(vec![a]),
+    );
+
    let i64_t = &TypeVar::new(
        "i64_t",
        "A scalar 64bit integer",
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -407,13 +407,18 @@ fn define_simd(
    let umax = insts.by_name("umax");
    let umin = insts.by_name("umin");
    let snarrow = insts.by_name("snarrow");
+    let swiden_high = insts.by_name("swiden_high");
+    let swiden_low = insts.by_name("swiden_low");
    let ushr_imm = insts.by_name("ushr_imm");
    let ushr = insts.by_name("ushr");
+    let uwiden_high = insts.by_name("uwiden_high");
+    let uwiden_low = insts.by_name("uwiden_low");
    let vconst = insts.by_name("vconst");
    let vall_true = insts.by_name("vall_true");
    let vany_true = insts.by_name("vany_true");
    let vselect = insts.by_name("vselect");

+    let x86_palignr = x86_instructions.by_name("x86_palignr");
    let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
    let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
    let x86_pmins = x86_instructions.by_name("x86_pmins");
@@ -786,6 +791,26 @@ fn define_simd(
        );
    }

+    // SIMD widen
+    for ty in &[I8, I16] {
+        let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size));
+        narrow.legalize(
+            def!(b = swiden_high(a)),
+            vec![
+                def!(c = x86_palignr(a, a, uimm8_eight)),
+                def!(b = swiden_low(c)),
+            ],
+        );
+        let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size));
+        narrow.legalize(
+            def!(b = uwiden_high(a)),
+            vec![
+                def!(c = x86_palignr(a, a, uimm8_eight)),
+                def!(b = uwiden_low(c)),
+            ],
+        );
+    }
+
    narrow.custom_legalize(shuffle, "convert_shuffle");
    narrow.custom_legalize(extractlane, "convert_extractlane");
    narrow.custom_legalize(insertlane, "convert_insertlane");
--- a/cranelift/codegen/meta/src/isa/x86/mod.rs
+++ b/cranelift/codegen/meta/src/isa/x86/mod.rs
@@ -14,7 +14,7 @@ mod legalize;
 mod opcodes;
 mod recipes;
 mod registers;
-mod settings;
+pub(crate) mod settings;

 pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
    let settings = settings::define(&shared_defs.settings);
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -354,6 +354,10 @@ pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc];
 /// Add packed unsigned word integers from xmm2/m128 and xmm1 saturate the results (SSE).
 pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd];

+/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is
+/// shifted to the right by the constant number of bytes in imm8 (SSSE3).
+pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f];
+
 /// Bitwise AND of xmm2/m128 and xmm1 (SSE2).
 pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb];

@@ -473,7 +477,7 @@ pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20];
 pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23];

 /// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
-/// integers in xmm1.
+/// integers in xmm1 (SSE4.1).
 pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25];

 /// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
@@ -485,7 +489,7 @@ pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30];
 pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33];

 /// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
-/// integers in xmm1.
+/// integers in xmm1 (SSE4.1).
 pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35];

 /// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of
--- a/cranelift/codegen/meta/src/isa/x86/settings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/settings.rs
@@ -3,12 +3,6 @@ use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder};
 pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
    let mut settings = SettingGroupBuilder::new("x86");

-    settings.add_bool(
-        "use_new_backend",
-        "Whether to use the new codegen backend using the new isel",
-        false,
-    );
-
    // CPUID.01H:ECX
    let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
    let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);
--- a/cranelift/codegen/meta/src/lib.rs
+++ b/cranelift/codegen/meta/src/lib.rs
@@ -25,7 +25,11 @@ pub fn isa_from_arch(arch: &str) -> Result<isa::Isa, String> {
 }

 /// Generates all the Rust source files used in Cranelift from the meta-language.
-pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
+pub fn generate(
+    old_backend_isas: &[isa::Isa],
+    new_backend_isas: &[isa::Isa],
+    out_dir: &str,
+) -> Result<(), error::Error> {
    // Create all the definitions:
    // - common definitions.
    let mut shared_defs = shared::define();
@@ -39,7 +43,7 @@ pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
    gen_types::generate("types.rs", &out_dir)?;

    // - per ISA definitions.
-    let isas = isa::define(isas, &mut shared_defs);
+    let target_isas = isa::define(old_backend_isas, &mut shared_defs);

    // At this point, all definitions are done.
    let all_formats = shared_defs.verify_instruction_formats();
@@ -53,9 +57,22 @@ pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
        &out_dir,
    )?;

-    gen_legalizer::generate(&isas, &shared_defs.transform_groups, "legalize", &out_dir)?;
+    let extra_legalization_groups: &[&'static str] = if !new_backend_isas.is_empty() {
+        // The new backend only requires the "expand" legalization group.
+        &["expand"]
+    } else {
+        &[]
+    };

-    for isa in isas {
+    gen_legalizer::generate(
+        &target_isas,
+        &shared_defs.transform_groups,
+        extra_legalization_groups,
+        "legalize",
+        &out_dir,
+    )?;
+
+    for isa in target_isas {
        gen_registers::generate(&isa, &format!("registers-{}.rs", isa.name), &out_dir)?;

        gen_settings::generate(
@@ -80,5 +97,28 @@ pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
        )?;
    }

+    for isa in new_backend_isas {
+        match isa {
+            isa::Isa::X86 => {
+                // If the old backend ISAs contained x86, this file has already been generated.
+                if old_backend_isas.iter().any(|isa| *isa == isa::Isa::X86) {
+                    continue;
+                }
+
+                let settings = crate::isa::x86::settings::define(&shared_defs.settings);
+                gen_settings::generate(
+                    &settings,
+                    gen_settings::ParentGroup::Shared,
+                    "settings-x86.rs",
+                    &out_dir,
+                )?;
+            }
+            isa::Isa::Arm64 => {
+                // aarch64 doesn't have platform-specific settings.
+            }
+            isa::Isa::Arm32 | isa::Isa::Riscv => todo!(),
+        }
+    }
+
    Ok(())
 }
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3883,9 +3883,9 @@ pub(crate) fn define(
        .constraints(vec![WiderOrEq(Int.clone(), IntTo.clone())]),
    );

-    let I16xN = &TypeVar::new(
-        "I16xN",
-        "A SIMD vector type containing integers 16-bits wide and up",
+    let I16or32xN = &TypeVar::new(
+        "I16or32xN",
+        "A SIMD vector type containing integer lanes 16 or 32 bits wide",
        TypeSetBuilder::new()
            .ints(16..32)
            .simd_lanes(4..8)
@@ -3893,9 +3893,9 @@ pub(crate) fn define(
            .build(),
    );

-    let x = &Operand::new("x", I16xN);
-    let y = &Operand::new("y", I16xN);
-    let a = &Operand::new("a", &I16xN.split_lanes());
+    let x = &Operand::new("x", I16or32xN);
+    let y = &Operand::new("y", I16or32xN);
+    let a = &Operand::new("a", &I16or32xN.split_lanes());

    ig.push(
        Inst::new(
@@ -3934,6 +3934,75 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    let I8or16xN = &TypeVar::new(
+        "I8or16xN",
+        "A SIMD vector type containing integer lanes 8 or 16 bits wide.",
+        TypeSetBuilder::new()
+            .ints(8..16)
+            .simd_lanes(8..16)
+            .includes_scalars(false)
+            .build(),
+    );
+
+    let x = &Operand::new("x", I8or16xN);
+    let a = &Operand::new("a", &I8or16xN.merge_lanes());
+
+    ig.push(
+        Inst::new(
+            "swiden_low",
+            r#"
+        Widen the low lanes of `x` using signed extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "swiden_high",
+            r#"
+        Widen the high lanes of `x` using signed extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "uwiden_low",
+            r#"
+        Widen the low lanes of `x` using unsigned extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "uwiden_high",
+            r#"
+        Widen the high lanes of `x` using unsigned extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
    let IntTo = &TypeVar::new(
        "IntTo",
        "A larger integer type with the same number of lanes",
--- a/cranelift/codegen/src/ir/instructions.rs
+++ b/cranelift/codegen/src/ir/instructions.rs
@@ -584,6 +584,9 @@ enum OperandConstraint {

    /// This operand is `ctrlType.split_lanes()`.
    SplitLanes,
+
+    /// This operand is `ctrlType.merge_lanes()`.
+    MergeLanes,
 }

 impl OperandConstraint {
@@ -615,6 +618,11 @@ impl OperandConstraint {
                    .split_lanes()
                    .expect("invalid type for split_lanes"),
            ),
+            MergeLanes => Bound(
+                ctrl_type
+                    .merge_lanes()
+                    .expect("invalid type for merge_lanes"),
+            ),
        }
    }
 }
--- a/cranelift/codegen/src/ir/types.rs
+++ b/cranelift/codegen/src/ir/types.rs
@@ -284,7 +284,7 @@ impl Type {

    /// Split the lane width in half and double the number of lanes to maintain the same bit-width.
    ///
-    /// If this is a scalar type of n bits, it produces a SIMD vector type of (n/2)x2.
+    /// If this is a scalar type of `n` bits, it produces a SIMD vector type of `(n/2)x2`.
    pub fn split_lanes(self) -> Option<Self> {
        match self.half_width() {
            Some(half_width) => half_width.by(2),
@@ -292,6 +292,17 @@ impl Type {
        }
    }

+    /// Merge lanes: halve the number of lanes and double the lane width, keeping the overall
+    /// bit-width unchanged.
+    ///
+    /// Returns `None` when `double_width()` or `half_vector()` fails, e.g. for a scalar type.
+    pub fn merge_lanes(self) -> Option<Self> {
+        match self.double_width() {
+            Some(double_width) => double_width.half_vector(),
+            None => None,
+        }
+    }
+
    /// Index of this type, for use with hash tables etc.
    pub fn index(self) -> usize {
        usize::from(self.0)
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -3,6 +3,7 @@
 // Some variants are never constructed, but we still want them as options in the future.
 #![allow(dead_code)]

+use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8};
 use crate::ir::Type;
 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::lower::ty_bits;
@@ -587,3 +588,55 @@ impl ScalarSize {
        }
    }
 }
+
+/// Size of a vector operand; the first number in a variant name is the lane width in bits.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum VectorSize {
+    Size8x8,
+    Size8x16,
+    Size16x4,
+    Size16x8,
+    Size32x2,
+    Size32x4,
+    Size64x2,
+}
+
+impl VectorSize {
+    /// Convert a vector type into a vector operand size; other types hit `unimplemented!()`.
+    pub fn from_ty(ty: Type) -> VectorSize {
+        match ty {
+            F32X2 => VectorSize::Size32x2,
+            F32X4 => VectorSize::Size32x4,
+            F64X2 => VectorSize::Size64x2,
+            I8X8 => VectorSize::Size8x8,
+            I8X16 => VectorSize::Size8x16,
+            I16X4 => VectorSize::Size16x4,
+            I16X8 => VectorSize::Size16x8,
+            I32X2 => VectorSize::Size32x2,
+            I32X4 => VectorSize::Size32x4,
+            I64X2 => VectorSize::Size64x2,
+            _ => unimplemented!(),
+        }
+    }
+
+    /// Get the integer operand size for a lane: `Size64` for `Size64x2`, `Size32` otherwise.
+    pub fn operand_size(&self) -> OperandSize {
+        match self {
+            VectorSize::Size64x2 => OperandSize::Size64,
+            _ => OperandSize::Size32,
+        }
+    }
+
+    /// Get the scalar size (`ScalarSize`) of a single lane of a vector with this size.
+    pub fn lane_size(&self) -> ScalarSize {
+        match self {
+            VectorSize::Size8x8 => ScalarSize::Size8,
+            VectorSize::Size8x16 => ScalarSize::Size8,
+            VectorSize::Size16x4 => ScalarSize::Size16,
+            VectorSize::Size16x8 => ScalarSize::Size16,
+            VectorSize::Size32x2 => ScalarSize::Size32,
+            VectorSize::Size32x4 => ScalarSize::Size32,
+            VectorSize::Size64x2 => ScalarSize::Size64,
+        }
+    }
+}
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1007,7 +1007,7 @@ impl MachInstEmit for Inst {
                sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
            }
            &Inst::FpuMoveFromVec { rd, rn, idx, size } => {
-                let (imm5, shift, mask) = match size {
+                let (imm5, shift, mask) = match size.lane_size() {
                    ScalarSize::Size32 => (0b00100, 3, 0b011),
                    ScalarSize::Size64 => (0b01000, 4, 0b001),
                    _ => unimplemented!(),
@@ -1048,6 +1048,10 @@ impl MachInstEmit for Inst {
                    FPUOp2::Max64 => 0b000_11110_01_1_00000_010010,
                    FPUOp2::Min32 => 0b000_11110_00_1_00000_010110,
                    FPUOp2::Min64 => 0b000_11110_01_1_00000_010110,
+                    FPUOp2::Sqadd64 => 0b010_11110_11_1_00000_000011,
+                    FPUOp2::Uqadd64 => 0b011_11110_11_1_00000_000011,
+                    FPUOp2::Sqsub64 => 0b010_11110_11_1_00000_001011,
+                    FPUOp2::Uqsub64 => 0b011_11110_11_1_00000_001011,
                };
                sink.put4(enc_fpurrr(top22, rd, rn, rm));
            }
@@ -1102,31 +1106,25 @@ impl MachInstEmit for Inst {
                };
                sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
            }
-            &Inst::VecMisc { op, rd, rn, ty } => {
-                let enc_size = match ty {
-                    I8X16 => 0b00,
-                    I16X8 => 0b01,
-                    I32X4 => 0b10,
-                    I64X2 => 0b11,
-                    _ => 0,
+            &Inst::VecMisc { op, rd, rn, size } => {
+                let enc_size = match size {
+                    VectorSize::Size8x16 => 0b00,
+                    VectorSize::Size16x8 => 0b01,
+                    VectorSize::Size32x4 => 0b10,
+                    VectorSize::Size64x2 => 0b11,
+                    _ => unimplemented!(),
                };
                let (bits_12_16, size) = match op {
-                    VecMisc2::Not => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b00101, 0b00)
-                    }
-                    VecMisc2::Neg => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b01011, enc_size)
-                    }
+                    VecMisc2::Not => (0b00101, 0b00),
+                    VecMisc2::Neg => (0b01011, enc_size),
                };
                sink.put4(enc_vec_rr_misc(size, bits_12_16, rd, rn));
            }
-            &Inst::VecLanes { op, rd, rn, ty } => {
-                let (q, size) = match ty {
-                    I8X16 => (0b1, 0b00),
-                    I16X8 => (0b1, 0b01),
-                    I32X4 => (0b1, 0b10),
+            &Inst::VecLanes { op, rd, rn, size } => {
+                let (q, size) = match size {
+                    VectorSize::Size8x16 => (0b1, 0b00),
+                    VectorSize::Size16x8 => (0b1, 0b01),
+                    VectorSize::Size32x4 => (0b1, 0b10),
                    _ => unreachable!(),
                };
                let (u, opcode) = match op {
@@ -1250,12 +1248,12 @@ impl MachInstEmit for Inst {
                        | machreg_to_vec(rd.to_reg()),
                );
            }
-            &Inst::MovFromVec { rd, rn, idx, ty } => {
-                let (q, imm5, shift, mask) = match ty {
-                    I8 => (0b0, 0b00001, 1, 0b1111),
-                    I16 => (0b0, 0b00010, 2, 0b0111),
-                    I32 => (0b0, 0b00100, 3, 0b0011),
-                    I64 => (0b1, 0b01000, 4, 0b0001),
+            &Inst::MovFromVec { rd, rn, idx, size } => {
+                let (q, imm5, shift, mask) = match size {
+                    VectorSize::Size8x16 => (0b0, 0b00001, 1, 0b1111),
+                    VectorSize::Size16x8 => (0b0, 0b00010, 2, 0b0111),
+                    VectorSize::Size32x4 => (0b0, 0b00100, 3, 0b0011),
+                    VectorSize::Size64x2 => (0b1, 0b01000, 4, 0b0001),
                    _ => unreachable!(),
                };
                debug_assert_eq!(idx & mask, idx);
@@ -1268,12 +1266,12 @@ impl MachInstEmit for Inst {
                        | machreg_to_gpr(rd.to_reg()),
                );
            }
-            &Inst::VecDup { rd, rn, ty } => {
-                let imm5 = match ty {
-                    I8 => 0b00001,
-                    I16 => 0b00010,
-                    I32 => 0b00100,
-                    I64 => 0b01000,
+            &Inst::VecDup { rd, rn, size } => {
+                let imm5 = match size {
+                    VectorSize::Size8x16 => 0b00001,
+                    VectorSize::Size16x8 => 0b00010,
+                    VectorSize::Size32x4 => 0b00100,
+                    VectorSize::Size64x2 => 0b01000,
                    _ => unimplemented!(),
                };
                sink.put4(
@@ -1283,10 +1281,10 @@ impl MachInstEmit for Inst {
                        | machreg_to_vec(rd.to_reg()),
                );
            }
-            &Inst::VecDupFromFpu { rd, rn, ty } => {
-                let imm5 = match ty {
-                    F32 => 0b00100,
-                    F64 => 0b01000,
+            &Inst::VecDupFromFpu { rd, rn, size } => {
+                let imm5 = match size {
+                    VectorSize::Size32x4 => 0b00100,
+                    VectorSize::Size64x2 => 0b01000,
                    _ => unimplemented!(),
                };
                sink.put4(
@@ -1318,41 +1316,25 @@ impl MachInstEmit for Inst {
                rn,
                rm,
                alu_op,
-                ty,
+                size,
            } => {
-                let enc_size = match ty {
-                    I8X16 => 0b00,
-                    I16X8 => 0b01,
-                    I32X4 => 0b10,
-                    I64X2 => 0b11,
+                let enc_size = match size {
+                    VectorSize::Size8x16 => 0b00,
+                    VectorSize::Size16x8 => 0b01,
+                    VectorSize::Size32x4 => 0b10,
+                    VectorSize::Size64x2 => 0b11,
                    _ => 0,
                };
-                let enc_size_for_fcmp = match ty {
-                    F32X4 => 0b0,
-                    F64X2 => 0b1,
+                let enc_size_for_fcmp = match size {
+                    VectorSize::Size32x4 => 0b0,
+                    VectorSize::Size64x2 => 0b1,
                    _ => 0,
                };

                let (top11, bit15_10) = match alu_op {
-                    VecALUOp::SQAddScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b010_11110_11_1, 0b000011)
-                    }
                    VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::SQSubScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b010_11110_11_1, 0b001011)
-                    }
                    VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::UQAddScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b011_11110_11_1, 0b000011)
-                    }
                    VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::UQSubScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b011_11110_11_1, 0b001011)
-                    }
                    VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
@@ -1364,31 +1346,16 @@ impl MachInstEmit for Inst {
                    VecALUOp::Fcmge => (0b011_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
                    // The following logical instructions operate on bytes, so are not encoded differently
                    // for the different vector types.
-                    VecALUOp::And => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b010_01110_00_1, 0b000111)
-                    }
-                    VecALUOp::Bic => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b010_01110_01_1, 0b000111)
-                    }
-                    VecALUOp::Orr => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b010_01110_10_1, 0b000111)
-                    }
-                    VecALUOp::Eor => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b011_01110_00_1, 0b000111)
-                    }
-                    VecALUOp::Bsl => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b011_01110_01_1, 0b000111)
-                    }
+                    VecALUOp::And => (0b010_01110_00_1, 0b000111),
+                    VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
+                    VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
+                    VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
+                    VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
                    VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
                    VecALUOp::Mul => {
-                        debug_assert_ne!(I64X2, ty);
+                        debug_assert_ne!(size, VectorSize::Size64x2);
                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
                    }
                    VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1841,7 +1841,7 @@ fn test_aarch64_binemit() {
            rd: writable_xreg(3),
            rn: vreg(27),
            idx: 14,
-            ty: I8,
+            size: VectorSize::Size8x16,
        },
        "633F1D0E",
        "umov w3, v27.b[14]",
@@ -1851,7 +1851,7 @@ fn test_aarch64_binemit() {
            rd: writable_xreg(24),
            rn: vreg(5),
            idx: 3,
-            ty: I16,
+            size: VectorSize::Size16x8,
        },
        "B83C0E0E",
        "umov w24, v5.h[3]",
@@ -1861,7 +1861,7 @@ fn test_aarch64_binemit() {
            rd: writable_xreg(12),
            rn: vreg(17),
            idx: 1,
-            ty: I32,
+            size: VectorSize::Size32x4,
        },
        "2C3E0C0E",
        "mov w12, v17.s[1]",
@@ -1871,7 +1871,7 @@ fn test_aarch64_binemit() {
            rd: writable_xreg(21),
            rn: vreg(20),
            idx: 0,
-            ty: I64,
+            size: VectorSize::Size64x2,
        },
        "953E084E",
        "mov x21, v20.d[0]",
@@ -1900,7 +1900,7 @@ fn test_aarch64_binemit() {
        Inst::VecDup {
            rd: writable_vreg(25),
            rn: xreg(7),
-            ty: I8,
+            size: VectorSize::Size8x16,
        },
        "F90C014E",
        "dup v25.16b, w7",
@@ -1909,7 +1909,7 @@ fn test_aarch64_binemit() {
        Inst::VecDup {
            rd: writable_vreg(2),
            rn: xreg(23),
-            ty: I16,
+            size: VectorSize::Size16x8,
        },
        "E20E024E",
        "dup v2.8h, w23",
@@ -1918,7 +1918,7 @@ fn test_aarch64_binemit() {
        Inst::VecDup {
            rd: writable_vreg(0),
            rn: xreg(28),
-            ty: I32,
+            size: VectorSize::Size32x4,
        },
        "800F044E",
        "dup v0.4s, w28",
@@ -1927,7 +1927,7 @@ fn test_aarch64_binemit() {
        Inst::VecDup {
            rd: writable_vreg(31),
            rn: xreg(5),
-            ty: I64,
+            size: VectorSize::Size64x2,
        },
        "BF0C084E",
        "dup v31.2d, x5",
@@ -1936,7 +1936,7 @@ fn test_aarch64_binemit() {
        Inst::VecDupFromFpu {
            rd: writable_vreg(14),
            rn: vreg(19),
-            ty: F32,
+            size: VectorSize::Size32x4,
        },
        "6E06044E",
        "dup v14.4s, v19.s[0]",
@@ -1945,7 +1945,7 @@ fn test_aarch64_binemit() {
        Inst::VecDupFromFpu {
            rd: writable_vreg(18),
            rn: vreg(10),
-            ty: F64,
+            size: VectorSize::Size64x2,
        },
        "5205084E",
        "dup v18.2d, v10.d[0]",
@@ -2004,50 +2004,6 @@ fn test_aarch64_binemit() {
        "5CA4202F",
        "uxtl v28.2d, v2.2s",
    ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::UQAddScalar,
-            ty: I64,
-        },
-        "D50EF77E",
-        "uqadd d21, d22, d23",
-    ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::SQAddScalar,
-            ty: I64,
-        },
-        "D50EF75E",
-        "sqadd d21, d22, d23",
-    ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::UQSubScalar,
-            ty: I64,
-        },
-        "D52EF77E",
-        "uqsub d21, d22, d23",
-    ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::SQSubScalar,
-            ty: I64,
-        },
-        "D52EF75E",
-        "sqsub d21, d22, d23",
-    ));

    insns.push((
        Inst::VecRRR {
@@ -2055,7 +2011,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(2),
            rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "410C284E",
        "sqadd v1.16b, v2.16b, v8.16b",
@@ -2067,7 +2023,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(12),
            rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "810D7C4E",
        "sqadd v1.8h, v12.8h, v28.8h",
@@ -2079,7 +2035,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(12),
            rn: vreg(2),
            rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "4C0CA64E",
        "sqadd v12.4s, v2.4s, v6.4s",
@@ -2091,7 +2047,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(20),
            rn: vreg(7),
            rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "F40CED4E",
        "sqadd v20.2d, v7.2d, v13.2d",
@@ -2103,7 +2059,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(2),
            rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "412C284E",
        "sqsub v1.16b, v2.16b, v8.16b",
@@ -2115,7 +2071,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(12),
            rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "812D7C4E",
        "sqsub v1.8h, v12.8h, v28.8h",
@@ -2127,7 +2083,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(12),
            rn: vreg(2),
            rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "4C2CA64E",
        "sqsub v12.4s, v2.4s, v6.4s",
@@ -2139,7 +2095,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(20),
            rn: vreg(7),
            rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "F42CED4E",
        "sqsub v20.2d, v7.2d, v13.2d",
@@ -2151,7 +2107,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(2),
            rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "410C286E",
        "uqadd v1.16b, v2.16b, v8.16b",
@@ -2163,7 +2119,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(12),
            rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "810D7C6E",
        "uqadd v1.8h, v12.8h, v28.8h",
@@ -2175,7 +2131,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(12),
            rn: vreg(2),
            rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "4C0CA66E",
        "uqadd v12.4s, v2.4s, v6.4s",
@@ -2187,7 +2143,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(20),
            rn: vreg(7),
            rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "F40CED6E",
        "uqadd v20.2d, v7.2d, v13.2d",
@@ -2199,7 +2155,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(2),
            rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "412C286E",
        "uqsub v1.16b, v2.16b, v8.16b",
@@ -2211,7 +2167,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(12),
            rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "812D7C6E",
        "uqsub v1.8h, v12.8h, v28.8h",
@@ -2223,7 +2179,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(12),
            rn: vreg(2),
            rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "4C2CA66E",
        "uqsub v12.4s, v2.4s, v6.4s",
@@ -2235,7 +2191,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(20),
            rn: vreg(7),
            rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "F42CED6E",
        "uqsub v20.2d, v7.2d, v13.2d",
@@ -2247,7 +2203,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(3),
            rn: vreg(23),
            rm: vreg(24),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "E38E386E",
        "cmeq v3.16b, v23.16b, v24.16b",
@@ -2259,7 +2215,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(3),
            rn: vreg(23),
            rm: vreg(24),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "E336384E",
        "cmgt v3.16b, v23.16b, v24.16b",
@@ -2271,7 +2227,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(23),
            rn: vreg(9),
            rm: vreg(12),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "373D2C4E",
        "cmge v23.16b, v9.16b, v12.16b",
@@ -2283,7 +2239,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(5),
            rn: vreg(1),
            rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "2534216E",
        "cmhi v5.16b, v1.16b, v1.16b",
@@ -2295,7 +2251,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(2),
            rm: vreg(15),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "483C2F6E",
        "cmhs v8.16b, v2.16b, v15.16b",
@@ -2307,7 +2263,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(3),
            rn: vreg(23),
            rm: vreg(24),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "E38E786E",
        "cmeq v3.8h, v23.8h, v24.8h",
@@ -2319,7 +2275,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(3),
            rn: vreg(23),
            rm: vreg(24),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "E336784E",
        "cmgt v3.8h, v23.8h, v24.8h",
@@ -2331,7 +2287,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(23),
            rn: vreg(9),
            rm: vreg(12),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "373D6C4E",
        "cmge v23.8h, v9.8h, v12.8h",
@@ -2343,7 +2299,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(5),
            rn: vreg(1),
            rm: vreg(1),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "2534616E",
        "cmhi v5.8h, v1.8h, v1.8h",
@@ -2355,7 +2311,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(2),
            rm: vreg(15),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "483C6F6E",
        "cmhs v8.8h, v2.8h, v15.8h",
@@ -2367,7 +2323,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(3),
            rn: vreg(23),
            rm: vreg(24),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "E38EB86E",
        "cmeq v3.4s, v23.4s, v24.4s",
@@ -2379,7 +2335,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(3),
            rn: vreg(23),
            rm: vreg(24),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "E336B84E",
        "cmgt v3.4s, v23.4s, v24.4s",
@@ -2391,7 +2347,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(23),
            rn: vreg(9),
            rm: vreg(12),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "373DAC4E",
        "cmge v23.4s, v9.4s, v12.4s",
@@ -2403,7 +2359,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(5),
            rn: vreg(1),
            rm: vreg(1),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "2534A16E",
        "cmhi v5.4s, v1.4s, v1.4s",
@@ -2415,7 +2371,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(2),
            rm: vreg(15),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "483CAF6E",
        "cmhs v8.4s, v2.4s, v15.4s",
@@ -2427,7 +2383,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(28),
            rn: vreg(12),
            rm: vreg(4),
-            ty: F32X4,
+            size: VectorSize::Size32x4,
        },
        "9CE5244E",
        "fcmeq v28.4s, v12.4s, v4.4s",
@@ -2439,7 +2395,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(3),
            rn: vreg(16),
            rm: vreg(31),
-            ty: F64X2,
+            size: VectorSize::Size64x2,
        },
        "03E6FF6E",
        "fcmgt v3.2d, v16.2d, v31.2d",
@@ -2451,7 +2407,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(23),
            rm: vreg(0),
-            ty: F64X2,
+            size: VectorSize::Size64x2,
        },
        "F2E6606E",
        "fcmge v18.2d, v23.2d, v0.2d",
@@ -2463,7 +2419,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(20),
            rn: vreg(19),
            rm: vreg(18),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "741E324E",
        "and v20.16b, v19.16b, v18.16b",
@@ -2475,7 +2431,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(11),
            rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "681D614E",
        "bic v8.16b, v11.16b, v1.16b",
@@ -2487,7 +2443,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(15),
            rn: vreg(2),
            rm: vreg(12),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "4F1CAC4E",
        "orr v15.16b, v2.16b, v12.16b",
@@ -2499,7 +2455,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(3),
            rm: vreg(22),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "721C366E",
        "eor v18.16b, v3.16b, v22.16b",
@@ -2511,7 +2467,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(9),
            rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "281D616E",
        "bsl v8.16b, v9.16b, v1.16b",
@@ -2523,7 +2479,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(12),
            rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "88A5216E",
        "umaxp v8.16b, v12.16b, v1.16b",
@@ -2535,7 +2491,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(6),
            rm: vreg(1),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "C1A4616E",
        "umaxp v1.8h, v6.8h, v1.8h",
@@ -2547,7 +2503,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(20),
            rm: vreg(16),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "81A6B06E",
        "umaxp v1.4s, v20.4s, v16.4s",
@@ -2559,7 +2515,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(5),
            rn: vreg(1),
            rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "2584214E",
        "add v5.16b, v1.16b, v1.16b",
@@ -2571,7 +2527,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(7),
            rn: vreg(13),
            rm: vreg(2),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "A785624E",
        "add v7.8h, v13.8h, v2.8h",
@@ -2583,7 +2539,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(9),
            rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "3285A64E",
        "add v18.4s, v9.4s, v6.4s",
@@ -2595,7 +2551,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(3),
            rm: vreg(2),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "6184E24E",
        "add v1.2d, v3.2d, v2.2d",
@@ -2607,7 +2563,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(5),
            rn: vreg(1),
            rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "2584216E",
        "sub v5.16b, v1.16b, v1.16b",
@@ -2619,7 +2575,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(7),
            rn: vreg(13),
            rm: vreg(2),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "A785626E",
        "sub v7.8h, v13.8h, v2.8h",
@@ -2631,7 +2587,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(9),
            rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "3285A66E",
        "sub v18.4s, v9.4s, v6.4s",
@@ -2643,7 +2599,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(0),
            rm: vreg(8),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "1284E86E",
        "sub v18.2d, v0.2d, v8.2d",
@@ -2655,7 +2611,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(25),
            rn: vreg(9),
            rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "399D284E",
        "mul v25.16b, v9.16b, v8.16b",
@@ -2667,7 +2623,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(30),
            rn: vreg(30),
            rm: vreg(12),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "DE9F6C4E",
        "mul v30.8h, v30.8h, v12.8h",
@@ -2679,7 +2635,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(18),
            rm: vreg(18),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "529EB24E",
        "mul v18.4s, v18.4s, v18.4s",
@@ -2691,7 +2647,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(18),
            rm: vreg(18),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "5246326E",
        "ushl v18.16b, v18.16b, v18.16b",
@@ -2703,7 +2659,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(18),
            rm: vreg(18),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "5246726E",
        "ushl v18.8h, v18.8h, v18.8h",
@@ -2715,7 +2671,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(1),
            rm: vreg(21),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "3244B56E",
        "ushl v18.4s, v1.4s, v21.4s",
@@ -2727,7 +2683,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(5),
            rn: vreg(7),
            rm: vreg(19),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "E544F36E",
        "ushl v5.2d, v7.2d, v19.2d",
@@ -2739,7 +2695,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(18),
            rn: vreg(18),
            rm: vreg(18),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "5246324E",
        "sshl v18.16b, v18.16b, v18.16b",
@@ -2751,7 +2707,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(30),
            rn: vreg(1),
            rm: vreg(29),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "3E447D4E",
        "sshl v30.8h, v1.8h, v29.8h",
@@ -2763,7 +2719,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(22),
            rm: vreg(21),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "C846B54E",
        "sshl v8.4s, v22.4s, v21.4s",
@@ -2775,7 +2731,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(8),
            rn: vreg(22),
            rm: vreg(2),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "C846E24E",
        "sshl v8.2d, v22.2d, v2.2d",
@@ -2786,7 +2742,7 @@ fn test_aarch64_binemit() {
            op: VecMisc2::Not,
            rd: writable_vreg(2),
            rn: vreg(1),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "2258206E",
        "mvn v2.16b, v1.16b",
@@ -2797,7 +2753,7 @@ fn test_aarch64_binemit() {
            op: VecMisc2::Neg,
            rd: writable_vreg(8),
            rn: vreg(12),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "88B9206E",
        "neg v8.16b, v12.16b",
@@ -2808,7 +2764,7 @@ fn test_aarch64_binemit() {
            op: VecMisc2::Neg,
            rd: writable_vreg(0),
            rn: vreg(31),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "E0BB606E",
        "neg v0.8h, v31.8h",
@@ -2819,7 +2775,7 @@ fn test_aarch64_binemit() {
            op: VecMisc2::Neg,
            rd: writable_vreg(2),
            rn: vreg(3),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "62B8A06E",
        "neg v2.4s, v3.4s",
@@ -2830,7 +2786,7 @@ fn test_aarch64_binemit() {
            op: VecMisc2::Neg,
            rd: writable_vreg(10),
            rn: vreg(8),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
        },
        "0AB9E06E",
        "neg v10.2d, v8.2d",
@@ -2841,7 +2797,7 @@ fn test_aarch64_binemit() {
            op: VecLanesOp::Uminv,
            rd: writable_vreg(2),
            rn: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
        },
        "22A8316E",
        "uminv b2, v1.16b",
@@ -2852,7 +2808,7 @@ fn test_aarch64_binemit() {
            op: VecLanesOp::Uminv,
            rd: writable_vreg(3),
            rn: vreg(11),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
        },
        "63A9716E",
        "uminv h3, v11.8h",
@@ -2863,7 +2819,7 @@ fn test_aarch64_binemit() {
            op: VecLanesOp::Uminv,
            rd: writable_vreg(18),
            rn: vreg(4),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
        },
        "92A8B16E",
        "uminv s18, v4.4s",
@@ -3214,7 +3170,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(1),
            rn: vreg(30),
            idx: 2,
-            size: ScalarSize::Size32,
+            size: VectorSize::Size32x4,
        },
        "C107145E",
        "mov s1, v30.s[2]",
@@ -3225,7 +3181,7 @@ fn test_aarch64_binemit() {
            rd: writable_vreg(23),
            rn: vreg(11),
            idx: 0,
-            size: ScalarSize::Size64,
+            size: VectorSize::Size64x2,
        },
        "7705085E",
        "mov d23, v11.d[0]",
@@ -3443,6 +3399,50 @@ fn test_aarch64_binemit() {
        "fmin d15, d30, d31",
    ));

+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Uqadd64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D50EF77E",
+        "uqadd d21, d22, d23",
+    ));
+
+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Sqadd64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D50EF75E",
+        "sqadd d21, d22, d23",
+    ));
+
+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Uqsub64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D52EF77E",
+        "uqsub d21, d22, d23",
+    ));
+
+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Sqsub64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D52EF75E",
+        "sqsub d21, d22, d23",
+    ));
+
    insns.push((
        Inst::FpuRRRR {
            fpu_op: FPUOp3::MAdd32,
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -5,8 +5,8 @@

 use crate::binemit::CodeOffset;
 use crate::ir::types::{
-    B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X2, F32X4, F64, F64X2, FFLAGS, I16,
-    I16X4, I16X8, I32, I32X2, I32X4, I64, I64X2, I8, I8X16, I8X8, IFLAGS, R32, R64,
+    B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X4, F64, F64X2, FFLAGS, I16, I16X8,
+    I32, I32X4, I64, I64X2, I8, I8X16, IFLAGS, R32, R64,
 };
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::machinst::*;
@@ -125,6 +125,14 @@ pub enum FPUOp2 {
    Max64,
    Min32,
    Min64,
+    /// Signed saturating add
+    Sqadd64,
+    /// Unsigned saturating add
+    Uqadd64,
+    /// Signed saturating subtract
+    Sqsub64,
+    /// Unsigned saturating subtract
+    Uqsub64,
 }

 /// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -208,16 +216,12 @@ pub enum VecExtendOp {
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecALUOp {
    /// Signed saturating add
-    SQAddScalar,
    Sqadd,
    /// Unsigned saturating add
-    UQAddScalar,
    Uqadd,
    /// Signed saturating subtract
-    SQSubScalar,
    Sqsub,
    /// Unsigned saturating subtract
-    UQSubScalar,
    Uqsub,
    /// Compare bitwise equal
    Cmeq,
@@ -590,7 +594,7 @@ pub enum Inst {
        rd: Writable<Reg>,
        rn: Reg,
        idx: u8,
-        size: ScalarSize,
+        size: VectorSize,
    },

    /// 1-op FPU instruction.
@@ -734,21 +738,21 @@ pub enum Inst {
        rd: Writable<Reg>,
        rn: Reg,
        idx: u8,
-        ty: Type,
+        size: VectorSize,
    },

    /// Duplicate general-purpose register to vector.
    VecDup {
        rd: Writable<Reg>,
        rn: Reg,
-        ty: Type,
+        size: VectorSize,
    },

    /// Duplicate scalar to vector.
    VecDupFromFpu {
        rd: Writable<Reg>,
        rn: Reg,
-        ty: Type,
+        size: VectorSize,
    },

    /// Vector extend.
@@ -764,7 +768,7 @@ pub enum Inst {
        rd: Writable<Reg>,
        rn: Reg,
        rm: Reg,
-        ty: Type,
+        size: VectorSize,
    },

    /// Vector two register miscellaneous instruction.
@@ -772,7 +776,7 @@ pub enum Inst {
        op: VecMisc2,
        rd: Writable<Reg>,
        rn: Reg,
-        ty: Type,
+        size: VectorSize,
    },

    /// Vector instruction across lanes.
@@ -780,7 +784,7 @@ pub enum Inst {
        op: VecLanesOp,
        rd: Writable<Reg>,
        rn: Reg,
-        ty: Type,
+        size: VectorSize,
    },

    /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
@@ -2504,13 +2508,8 @@ impl Inst {
                format!("mov {}.16b, {}.16b", rd, rn)
            }
            &Inst::FpuMoveFromVec { rd, rn, idx, size } => {
-                let vector_type = match size {
-                    ScalarSize::Size32 => F32,
-                    ScalarSize::Size64 => F64,
-                    _ => unimplemented!(),
-                };
-                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
-                let rn = show_vreg_element(rn, mb_rru, idx, vector_type);
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size());
+                let rn = show_vreg_element(rn, mb_rru, idx, size);
                format!("mov {}, {}", rd, rn)
            }
            &Inst::FpuRR { fpu_op, rd, rn } => {
@@ -2542,6 +2541,10 @@ impl Inst {
                    FPUOp2::Max64 => ("fmax", ScalarSize::Size64),
                    FPUOp2::Min32 => ("fmin", ScalarSize::Size32),
                    FPUOp2::Min64 => ("fmin", ScalarSize::Size64),
+                    FPUOp2::Sqadd64 => ("sqadd", ScalarSize::Size64),
+                    FPUOp2::Uqadd64 => ("uqadd", ScalarSize::Size64),
+                    FPUOp2::Sqsub64 => ("sqsub", ScalarSize::Size64),
+                    FPUOp2::Uqsub64 => ("uqsub", ScalarSize::Size64),
                };
                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
                let rn = show_vreg_scalar(rn, mb_rru, size);
@@ -2557,7 +2560,7 @@ impl Inst {
                };

                let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
-                    |reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
+                    |reg, mb_rru| show_vreg_vector(reg, mb_rru, VectorSize::Size32x2)
                } else {
                    |reg, mb_rru| show_vreg_scalar(reg, mb_rru, ScalarSize::Size64)
                };
@@ -2706,45 +2709,36 @@ impl Inst {
                let rn = rn.show_rru(mb_rru);
                format!("mov {}.d[0], {}", rd, rn)
            }
-            &Inst::MovFromVec { rd, rn, idx, ty } => {
-                let op = match ty {
-                    I32 | I64 => "mov",
-                    _ => "umov",
+            &Inst::MovFromVec { rd, rn, idx, size } => {
+                let op = match size {
+                    VectorSize::Size8x16 => "umov",
+                    VectorSize::Size16x8 => "umov",
+                    VectorSize::Size32x4 => "mov",
+                    VectorSize::Size64x2 => "mov",
+                    _ => unimplemented!(),
                };
-                let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::from_ty(ty));
-                let rn = show_vreg_element(rn, mb_rru, idx, ty);
+                let rd = show_ireg_sized(rd.to_reg(), mb_rru, size.operand_size());
+                let rn = show_vreg_element(rn, mb_rru, idx, size);
                format!("{} {}, {}", op, rd, rn)
            }
-            &Inst::VecDup { rd, rn, ty } => {
-                let vector_type = match ty {
-                    I8 => I8X16,
-                    I16 => I16X8,
-                    I32 => I32X4,
-                    I64 => I64X2,
-                    _ => unimplemented!(),
-                };
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, vector_type);
-                let rn = show_ireg_sized(rn, mb_rru, OperandSize::from_ty(ty));
+            &Inst::VecDup { rd, rn, size } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_ireg_sized(rn, mb_rru, size.operand_size());
                format!("dup {}, {}", rd, rn)
            }
-            &Inst::VecDupFromFpu { rd, rn, ty } => {
-                let vector_type = match ty {
-                    F32 => F32X4,
-                    F64 => F64X2,
-                    _ => unimplemented!(),
-                };
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, vector_type);
-                let rn = show_vreg_element(rn, mb_rru, 0, ty);
+            &Inst::VecDupFromFpu { rd, rn, size } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_element(rn, mb_rru, 0, size);
                format!("dup {}, {}", rd, rn)
            }
            &Inst::VecExtend { t, rd, rn } => {
                let (op, dest, src) = match t {
-                    VecExtendOp::Sxtl8 => ("sxtl", I16X8, I8X8),
-                    VecExtendOp::Sxtl16 => ("sxtl", I32X4, I16X4),
-                    VecExtendOp::Sxtl32 => ("sxtl", I64X2, I32X2),
-                    VecExtendOp::Uxtl8 => ("uxtl", I16X8, I8X8),
-                    VecExtendOp::Uxtl16 => ("uxtl", I32X4, I16X4),
-                    VecExtendOp::Uxtl32 => ("uxtl", I64X2, I32X2),
+                    VecExtendOp::Sxtl8 => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8),
+                    VecExtendOp::Sxtl16 => ("sxtl", VectorSize::Size32x4, VectorSize::Size16x4),
+                    VecExtendOp::Sxtl32 => ("sxtl", VectorSize::Size64x2, VectorSize::Size32x2),
+                    VecExtendOp::Uxtl8 => ("uxtl", VectorSize::Size16x8, VectorSize::Size8x8),
+                    VecExtendOp::Uxtl16 => ("uxtl", VectorSize::Size32x4, VectorSize::Size16x4),
+                    VecExtendOp::Uxtl32 => ("uxtl", VectorSize::Size64x2, VectorSize::Size32x2),
                };
                let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest);
                let rn = show_vreg_vector(rn, mb_rru, src);
@@ -2755,72 +2749,54 @@ impl Inst {
                rn,
                rm,
                alu_op,
-                ty,
+                size,
            } => {
-                let (op, vector, ty) = match alu_op {
-                    VecALUOp::SQAddScalar => ("sqadd", false, ty),
-                    VecALUOp::Sqadd => ("sqadd", true, ty),
-                    VecALUOp::UQAddScalar => ("uqadd", false, ty),
-                    VecALUOp::Uqadd => ("uqadd", true, ty),
-                    VecALUOp::SQSubScalar => ("sqsub", false, ty),
-                    VecALUOp::Sqsub => ("sqsub", true, ty),
-                    VecALUOp::UQSubScalar => ("uqsub", false, ty),
-                    VecALUOp::Uqsub => ("uqsub", true, ty),
-                    VecALUOp::Cmeq => ("cmeq", true, ty),
-                    VecALUOp::Cmge => ("cmge", true, ty),
-                    VecALUOp::Cmgt => ("cmgt", true, ty),
-                    VecALUOp::Cmhs => ("cmhs", true, ty),
-                    VecALUOp::Cmhi => ("cmhi", true, ty),
-                    VecALUOp::Fcmeq => ("fcmeq", true, ty),
-                    VecALUOp::Fcmgt => ("fcmgt", true, ty),
-                    VecALUOp::Fcmge => ("fcmge", true, ty),
-                    VecALUOp::And => ("and", true, I8X16),
-                    VecALUOp::Bic => ("bic", true, I8X16),
-                    VecALUOp::Orr => ("orr", true, I8X16),
-                    VecALUOp::Eor => ("eor", true, I8X16),
-                    VecALUOp::Bsl => ("bsl", true, I8X16),
-                    VecALUOp::Umaxp => ("umaxp", true, ty),
-                    VecALUOp::Add => ("add", true, ty),
-                    VecALUOp::Sub => ("sub", true, ty),
-                    VecALUOp::Mul => ("mul", true, ty),
-                    VecALUOp::Sshl => ("sshl", true, ty),
-                    VecALUOp::Ushl => ("ushl", true, ty),
+                let (op, size) = match alu_op {
+                    VecALUOp::Sqadd => ("sqadd", size),
+                    VecALUOp::Uqadd => ("uqadd", size),
+                    VecALUOp::Sqsub => ("sqsub", size),
+                    VecALUOp::Uqsub => ("uqsub", size),
+                    VecALUOp::Cmeq => ("cmeq", size),
+                    VecALUOp::Cmge => ("cmge", size),
+                    VecALUOp::Cmgt => ("cmgt", size),
+                    VecALUOp::Cmhs => ("cmhs", size),
+                    VecALUOp::Cmhi => ("cmhi", size),
+                    VecALUOp::Fcmeq => ("fcmeq", size),
+                    VecALUOp::Fcmgt => ("fcmgt", size),
+                    VecALUOp::Fcmge => ("fcmge", size),
+                    VecALUOp::And => ("and", VectorSize::Size8x16),
+                    VecALUOp::Bic => ("bic", VectorSize::Size8x16),
+                    VecALUOp::Orr => ("orr", VectorSize::Size8x16),
+                    VecALUOp::Eor => ("eor", VectorSize::Size8x16),
+                    VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
+                    VecALUOp::Umaxp => ("umaxp", size),
+                    VecALUOp::Add => ("add", size),
+                    VecALUOp::Sub => ("sub", size),
+                    VecALUOp::Mul => ("mul", size),
+                    VecALUOp::Sshl => ("sshl", size),
+                    VecALUOp::Ushl => ("ushl", size),
                };
-
-                let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
-                    |reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty)
-                } else {
-                    |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru, ScalarSize::Size64)
-                };
-
-                let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty);
-                let rn = show_vreg_fn(rn, mb_rru, ty);
-                let rm = show_vreg_fn(rm, mb_rru, ty);
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_vector(rn, mb_rru, size);
+                let rm = show_vreg_vector(rm, mb_rru, size);
                format!("{} {}, {}, {}", op, rd, rn, rm)
            }
-            &Inst::VecMisc { op, rd, rn, ty } => {
-                let (op, ty) = match op {
-                    VecMisc2::Not => ("mvn", I8X16),
-                    VecMisc2::Neg => ("neg", ty),
+            &Inst::VecMisc { op, rd, rn, size } => {
+                let (op, size) = match op {
+                    VecMisc2::Not => ("mvn", VectorSize::Size8x16),
+                    VecMisc2::Neg => ("neg", size),
                };

-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty);
-                let rn = show_vreg_vector(rn, mb_rru, ty);
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_vector(rn, mb_rru, size);
                format!("{} {}, {}", op, rd, rn)
            }
-            &Inst::VecLanes { op, rd, rn, ty } => {
+            &Inst::VecLanes { op, rd, rn, size } => {
                let op = match op {
                    VecLanesOp::Uminv => "uminv",
                };
-                let size = match ty {
-                    I8X16 => ScalarSize::Size8,
-                    I16X8 => ScalarSize::Size16,
-                    I32X4 => ScalarSize::Size32,
-                    _ => unimplemented!(),
-                };
-
-                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
-                let rn = show_vreg_vector(rn, mb_rru, ty);
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size());
+                let rn = show_vreg_vector(rn, mb_rru, size);
                format!("{} {}, {}", op, rd, rn)
            }
            &Inst::MovToNZCV { rn } => {
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -1,8 +1,8 @@
 //! AArch64 ISA definitions: registers.

-use crate::ir::types::*;
 use crate::isa::aarch64::inst::OperandSize;
 use crate::isa::aarch64::inst::ScalarSize;
+use crate::isa::aarch64::inst::VectorSize;
 use crate::machinst::*;
 use crate::settings;

@@ -307,40 +307,42 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: Scalar
 }

 /// Show a vector register.
-pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
+pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: VectorSize) -> String {
    assert_eq!(RegClass::V128, reg.get_class());
    let mut s = reg.show_rru(mb_rru);

-    match ty {
-        F32X2 => s.push_str(".2s"),
-        F32X4 => s.push_str(".4s"),
-        F64X2 => s.push_str(".2d"),
-        I8X8 => s.push_str(".8b"),
-        I8X16 => s.push_str(".16b"),
-        I16X4 => s.push_str(".4h"),
-        I16X8 => s.push_str(".8h"),
-        I32X2 => s.push_str(".2s"),
-        I32X4 => s.push_str(".4s"),
-        I64X2 => s.push_str(".2d"),
-        _ => unimplemented!(),
-    }
+    let suffix = match size {
+        VectorSize::Size8x8 => ".8b",
+        VectorSize::Size8x16 => ".16b",
+        VectorSize::Size16x4 => ".4h",
+        VectorSize::Size16x8 => ".8h",
+        VectorSize::Size32x2 => ".2s",
+        VectorSize::Size32x4 => ".4s",
+        VectorSize::Size64x2 => ".2d",
+    };

+    s.push_str(suffix);
    s
 }

 /// Show an indexed vector element.
-pub fn show_vreg_element(reg: Reg, mb_rru: Option<&RealRegUniverse>, idx: u8, ty: Type) -> String {
+pub fn show_vreg_element(
+    reg: Reg,
+    mb_rru: Option<&RealRegUniverse>,
+    idx: u8,
+    size: VectorSize,
+) -> String {
    assert_eq!(RegClass::V128, reg.get_class());
    let mut s = reg.show_rru(mb_rru);

-    let suffix = match ty {
-        I8 => "b",
-        I16 => "h",
-        I32 => "s",
-        I64 => "d",
-        F32 => "s",
-        F64 => "d",
-        _ => unimplemented!(),
+    let suffix = match size {
+        VectorSize::Size8x8 => "b",
+        VectorSize::Size8x16 => "b",
+        VectorSize::Size16x4 => "h",
+        VectorSize::Size16x8 => "h",
+        VectorSize::Size32x2 => "s",
+        VectorSize::Size32x4 => "s",
+        VectorSize::Size64x2 => "d",
    };

    s.push_str(&format!(".{}[{}]", suffix, idx));
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -14,7 +14,7 @@ use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode, Type};
 use crate::machinst::lower::*;
 use crate::machinst::*;
-use crate::{CodegenError, CodegenResult};
+use crate::CodegenResult;

 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::AArch64Backend;
@@ -736,20 +736,11 @@ pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
    ty: Type,
    cond: Cond,
 ) -> CodegenResult<()> {
-    match ty {
-        F32X4 | F64X2 | I8X16 | I16X8 | I32X4 => {}
-        _ => {
-            return Err(CodegenError::Unsupported(format!(
-                "unsupported SIMD type: {:?}",
-                ty
-            )));
-        }
-    };
-
    let is_float = match ty {
        F32X4 | F64X2 => true,
        _ => false,
    };
+    let size = VectorSize::from_ty(ty);
    // 'Less than' operations are implemented by swapping
    // the order of operands and using the 'greater than'
    // instructions.
@@ -784,7 +775,7 @@ pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
        rd,
        rn,
        rm,
-        ty,
+        size,
    });

    if cond == Cond::Ne {
@@ -792,7 +783,7 @@ pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
            op: VecMisc2::Not,
            rd,
            rn: rd.to_reg(),
-            ty: I8X16,
+            size,
        });
    }

--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -70,7 +70,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    rn,
                    rm,
                    alu_op: VecALUOp::Add,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
@@ -89,13 +89,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    rn,
                    rm,
                    alu_op: VecALUOp::Sub,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
-            // We use the vector instruction set's saturating adds (UQADD /
-            // SQADD), which require vector registers.
+            // We use the scalar SIMD & FP saturating additions and subtractions
+            // (SQADD / UQADD / SQSUB / UQSUB), which require scalar FP registers.
            let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat;
            let ty = ty.unwrap();
            let rd = get_output_reg(ctx, outputs[0]);
@@ -105,11 +105,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                } else {
                    NarrowValueMode::ZeroExtend64
                };
-                let alu_op = match op {
-                    Opcode::UaddSat => VecALUOp::UQAddScalar,
-                    Opcode::SaddSat => VecALUOp::SQAddScalar,
-                    Opcode::UsubSat => VecALUOp::UQSubScalar,
-                    Opcode::SsubSat => VecALUOp::SQSubScalar,
+                let fpu_op = match op {
+                    Opcode::UaddSat => FPUOp2::Uqadd64,
+                    Opcode::SaddSat => FPUOp2::Sqadd64,
+                    Opcode::UsubSat => FPUOp2::Uqsub64,
+                    Opcode::SsubSat => FPUOp2::Sqsub64,
                    _ => unreachable!(),
                };
                let va = ctx.alloc_tmp(RegClass::V128, I128);
@@ -118,18 +118,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
                ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
                ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
-                ctx.emit(Inst::VecRRR {
+                ctx.emit(Inst::FpuRRR {
+                    fpu_op,
                    rd: va,
                    rn: va.to_reg(),
                    rm: vb.to_reg(),
-                    alu_op,
-                    ty: I64,
                });
                ctx.emit(Inst::MovFromVec {
                    rd,
                    rn: va.to_reg(),
                    idx: 0,
-                    ty: I64,
+                    size: VectorSize::Size64x2,
                });
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -148,7 +147,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    rn,
                    rm,
                    alu_op,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
@@ -167,7 +166,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    op: VecMisc2::Neg,
                    rd,
                    rn,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
@@ -192,7 +191,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    rd,
                    rn,
                    rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
@@ -422,7 +421,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    op: VecMisc2::Not,
                    rd,
                    rn: rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
@@ -466,7 +465,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    rd,
                    rn,
                    rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
@@ -495,7 +494,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
            } else {
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-
+                let size = VectorSize::from_ty(ty);
                let (alu_op, is_right_shift) = match op {
                    Opcode::Ishl => (VecALUOp::Sshl, false),
                    Opcode::Ushr => (VecALUOp::Ushl, true),
@@ -514,18 +513,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
                };

-                ctx.emit(Inst::VecDup {
-                    rd,
-                    rn: rm,
-                    ty: ty.lane_type(),
-                });
+                ctx.emit(Inst::VecDup { rd, rn: rm, size });

                ctx.emit(Inst::VecRRR {
                    alu_op,
                    rd,
                    rn,
                    rm: rd.to_reg(),
-                    ty,
+                    size,
                });
            }
        }
@@ -1167,7 +1162,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                    rd,
                    rn,
                    rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                });
            }
        }
@@ -1297,7 +1292,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        rd,
                        rn,
                        idx: 0,
-                        ty: I64,
+                        size: VectorSize::Size64x2,
                    });
                }
            }
@@ -1557,15 +1552,15 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                let idx = *imm;
                let rd = get_output_reg(ctx, outputs[0]);
                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
                let ty = ty.unwrap();

                if ty_is_int(ty) {
-                    ctx.emit(Inst::MovFromVec { rd, rn, idx, ty });
+                    ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                // Plain moves are faster on some processors.
                } else if idx == 0 {
                    ctx.emit(Inst::gen_move(rd, rn, ty));
                } else {
-                    let size = ScalarSize::from_ty(ty);
                    ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
                }
            } else {
@@ -1576,11 +1571,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        Opcode::Splat => {
            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rd = get_output_reg(ctx, outputs[0]);
-            let ty = ctx.input_ty(insn, 0);
-            let inst = if ty_is_int(ty) {
-                Inst::VecDup { rd, rn, ty }
+            let input_ty = ctx.input_ty(insn, 0);
+            let size = VectorSize::from_ty(ty.unwrap());
+            let inst = if ty_is_int(input_ty) {
+                Inst::VecDup { rd, rn, size }
            } else {
-                Inst::VecDupFromFpu { rd, rn, ty }
+                Inst::VecDupFromFpu { rd, rn, size }
            };
            ctx.emit(inst);
        }
@@ -1598,21 +1594,22 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            // cmp xm, #0
            // cset xm, ne

-            let input_ty = ctx.input_ty(insn, 0);
+            let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
+
            if op == Opcode::VanyTrue {
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Umaxp,
                    rd: tmp,
                    rn: rm,
                    rm: rm,
-                    ty: input_ty,
+                    size,
                });
            } else {
                ctx.emit(Inst::VecLanes {
                    op: VecLanesOp::Uminv,
                    rd: tmp,
                    rn: rm,
-                    ty: input_ty,
+                    size,
                });
            };

@@ -1620,7 +1617,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                rd,
                rn: tmp.to_reg(),
                idx: 0,
-                ty: I64,
+                size: VectorSize::Size64x2,
            });

            ctx.emit(Inst::AluRRImm12 {
@@ -2136,6 +2133,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        | Opcode::X86Insertps
        | Opcode::X86Movsd
        | Opcode::X86Movlhps
+        | Opcode::X86Palignr
        | Opcode::X86Psll
        | Opcode::X86Psrl
        | Opcode::X86Psra
@@ -2156,7 +2154,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(

        Opcode::AvgRound => unimplemented!(),
        Opcode::Iabs => unimplemented!(),
-        Opcode::Snarrow | Opcode::Unarrow => unimplemented!(),
+        Opcode::Snarrow
+        | Opcode::Unarrow
+        | Opcode::SwidenLow
+        | Opcode::SwidenHigh
+        | Opcode::UwidenLow
+        | Opcode::UwidenHigh => unimplemented!(),
        Opcode::TlsValue => unimplemented!(),
    }

--- a/cranelift/codegen/src/isa/mod.rs
+++ b/cranelift/codegen/src/isa/mod.rs
@@ -121,8 +121,12 @@ pub fn lookup(triple: Triple) -> Result<Builder, LookupError> {
    match triple.architecture {
        Architecture::Riscv32 | Architecture::Riscv64 => isa_builder!(riscv, "riscv", triple),
        Architecture::I386 | Architecture::I586 | Architecture::I686 | Architecture::X86_64 => {
+            if cfg!(feature = "x64") {
+                isa_builder!(x64, "x64", triple)
+            } else {
                isa_builder!(x86, "x86", triple)
            }
+        }
        Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple),
        Architecture::Aarch64 { .. } => isa_builder!(aarch64, "arm64", triple),
        _ => Err(LookupError::Unsupported),
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -11,28 +11,33 @@ use crate::isa::Builder as IsaBuilder;
 use crate::machinst::pretty_print::ShowWithRRU;
 use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
 use crate::result::CodegenResult;
-use crate::settings::{self, Flags};
+use crate::settings::{self as shared_settings, Flags};

-use crate::isa::x64::inst::regs::create_reg_universe_systemv;
+use crate::isa::x64::{inst::regs::create_reg_universe_systemv, settings as x64_settings};
+
+use super::TargetIsa;

 mod abi;
 mod inst;
 mod lower;
+mod settings;

 /// An X64 backend.
 pub(crate) struct X64Backend {
    triple: Triple,
    flags: Flags,
+    _x64_flags: x64_settings::Flags,
    reg_universe: RealRegUniverse,
 }

 impl X64Backend {
    /// Create a new X64 backend with the given (shared) flags.
-    fn new_with_flags(triple: Triple, flags: Flags) -> Self {
+    fn new_with_flags(triple: Triple, flags: Flags, x64_flags: x64_settings::Flags) -> Self {
        let reg_universe = create_reg_universe_systemv(&flags);
        Self {
            triple,
            flags,
+            _x64_flags: x64_flags,
            reg_universe,
        }
    }
@@ -103,10 +108,17 @@ impl MachBackend for X64Backend {
 pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder {
    IsaBuilder {
        triple,
-        setup: settings::builder(),
-        constructor: |triple: Triple, flags: Flags, _arch_flag_builder: settings::Builder| {
-            let backend = X64Backend::new_with_flags(triple, flags);
-            Box::new(TargetIsaAdapter::new(backend))
-        },
+        setup: x64_settings::builder(),
+        constructor: isa_constructor,
    }
 }
+
+fn isa_constructor(
+    triple: Triple,
+    shared_flags: Flags,
+    builder: shared_settings::Builder,
+) -> Box<dyn TargetIsa> {
+    let isa_flags = x64_settings::Flags::new(&shared_flags, builder);
+    let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
+    Box::new(TargetIsaAdapter::new(backend))
+}
--- a/cranelift/codegen/src/isa/x64/settings.rs
+++ b/cranelift/codegen/src/isa/x64/settings.rs
@@ -0,0 +1,9 @@
+//! x86 Settings.
+
+use crate::settings::{self, detail, Builder};
+use core::fmt;
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+// `cranelift-codegen/meta/src/isa/x86/settings.rs`.
+include!(concat!(env!("OUT_DIR"), "/settings-x86.rs"));
--- a/cranelift/codegen/src/isa/x86/mod.rs
+++ b/cranelift/codegen/src/isa/x86/mod.rs
@@ -57,20 +57,12 @@ fn isa_constructor(

    let isa_flags = settings::Flags::new(&shared_flags, builder);

-    if isa_flags.use_new_backend() {
-        #[cfg(not(feature = "x64"))]
-        panic!("new backend x86 support not included by cargo features!");
-
-        #[cfg(feature = "x64")]
-        super::x64::isa_builder(triple).finish(shared_flags)
-    } else {
    Box::new(Isa {
        triple,
        isa_flags,
        shared_flags,
        cpumode: level1,
    })
-    }
 }

 impl TargetIsa for Isa {
--- a/cranelift/codegen/src/legalizer/mod.rs
+++ b/cranelift/codegen/src/legalizer/mod.rs
@@ -19,10 +19,24 @@ use crate::flowgraph::ControlFlowGraph;
 use crate::ir::types::{I32, I64};
 use crate::ir::{self, InstBuilder, MemFlags};
 use crate::isa::TargetIsa;
+
+#[cfg(any(
+    feature = "x86",
+    feature = "arm32",
+    feature = "arm64",
+    feature = "riscv"
+))]
 use crate::predicates;
+#[cfg(any(
+    feature = "x86",
+    feature = "arm32",
+    feature = "arm64",
+    feature = "riscv"
+))]
+use alloc::vec::Vec;
+
 use crate::timing;
 use alloc::collections::BTreeSet;
-use alloc::vec::Vec;

 mod boundary;
 mod call;
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
@@ -1,5 +1,6 @@
 test binemit
-target x86_64
+set enable_simd
+target x86_64 nehalem

 ; Ensure raw_bitcast emits no instructions.
 function %raw_bitcast_i16x8_to_b32x4() {
@@ -10,8 +11,16 @@ block0:
            return
 }

-function %fcvt_32(i32x4) {
-block0(v0: i32x4 [%xmm6]):
-[-, %xmm2]  v1 = fcvt_from_sint.f32x4 v0    ; bin: 40 0f 5b d6
+function %conversions_i32x4(i32x4, i32x4) {
+block0(v0: i32x4 [%xmm6], v1: i32x4 [%xmm4]):
+[-, %xmm2]  v2 = fcvt_from_sint.f32x4 v0    ; bin: 40 0f 5b d6
+[-, %xmm6]  v3 = x86_palignr v0, v1, 3      ; bin: 66 0f 3a 0f f4 03
+            return
+}
+
+function %conversions_i16x8(i16x8) {
+block0(v0: i16x8 [%xmm6]):
+[-, %xmm2]  v1 = swiden_low v0              ; bin: 66 0f 38 23 d6
+[-, %xmm11] v2 = uwiden_low v0              ; bin: 66 44 0f 38 33 de
            return
 }
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
@@ -52,3 +52,19 @@ block0(v0:f32x4):
    ; nextln: v1 = iadd v12, v11
    return v1
 }
+
+function %uwiden_high(i8x16) -> i16x8 {
+block0(v0: i8x16):
+    v1 = uwiden_high v0
+    ; check: v2 = x86_palignr v0, v0, 8
+    ; nextln: v1 = uwiden_low v2
+    return v1
+}
+
+function %swiden_high(i16x8) -> i32x4 {
+block0(v0: i16x8):
+    v1 = swiden_high v0
+    ; check: v2 = x86_palignr v0, v0, 8
+    ; nextln: v1 = swiden_low v2
+    return v1
+}
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1582,17 +1582,39 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
            let (a, b) = pop2_with_bitcast(state, I32X4, builder);
            state.push1(builder.ins().unarrow(a, b))
        }
-        Operator::I16x8WidenLowI8x16S { .. }
-        | Operator::I16x8WidenHighI8x16S { .. }
-        | Operator::I16x8WidenLowI8x16U { .. }
-        | Operator::I16x8WidenHighI8x16U { .. }
-        | Operator::I32x4WidenLowI16x8S { .. }
-        | Operator::I32x4WidenHighI16x8S { .. }
-        | Operator::I32x4WidenLowI16x8U { .. }
-        | Operator::I32x4WidenHighI16x8U { .. }
-        | Operator::I8x16Bitmask
-        | Operator::I16x8Bitmask
-        | Operator::I32x4Bitmask => {
+        Operator::I16x8WidenLowI8x16S => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().swiden_low(a))
+        }
+        Operator::I16x8WidenHighI8x16S => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().swiden_high(a))
+        }
+        Operator::I16x8WidenLowI8x16U => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().uwiden_low(a))
+        }
+        Operator::I16x8WidenHighI8x16U => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().uwiden_high(a))
+        }
+        Operator::I32x4WidenLowI16x8S => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().swiden_low(a))
+        }
+        Operator::I32x4WidenHighI16x8S => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().swiden_high(a))
+        }
+        Operator::I32x4WidenLowI16x8U => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().uwiden_low(a))
+        }
+        Operator::I32x4WidenHighI16x8U => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().uwiden_high(a))
+        }
+        Operator::I8x16Bitmask | Operator::I16x8Bitmask | Operator::I32x4Bitmask => {
            return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
        }

--- a/crates/c-api/include/wasmtime.h
+++ b/crates/c-api/include/wasmtime.h
@@ -515,8 +515,7 @@ typedef own wasm_trap_t* (*wasmtime_func_callback_t)(const wasmtime_caller_t* ca
 *
 * This function is the same as #wasm_func_callback_with_env_t except that its
 * first argument is a #wasmtime_caller_t which allows learning information
- * about the
- * caller.
+ * about the caller.
 */
 typedef own wasm_trap_t* (*wasmtime_func_callback_with_env_t)(const wasmtime_caller_t* caller, void* env, const wasm_val_t args[], wasm_val_t results[]);

@@ -544,6 +543,28 @@ WASM_API_EXTERN own wasm_func_t* wasmtime_func_new_with_env(
  void (*finalizer)(void*)
 );

+/**
+ * \brief Creates a new `funcref` value referencing `func`.
+ *
+ * Create a `funcref` value that references `func` and writes it to `funcrefp`.
+ *
+ * Gives ownership of the `funcref` value written to `funcrefp`.
+ *
+ * Both `func` and `funcrefp` must not be NULL.
+ */
+WASM_API_EXTERN void wasmtime_func_as_funcref(const wasm_func_t* func, wasm_val_t* funcrefp);
+
+/**
+ * \brief Get the `wasm_func_t*` referenced by the given `funcref` value.
+ *
+ * Gets an owning handle to the `wasm_func_t*` that the given `funcref` value is
+ * referencing. Returns NULL if the value is not a `funcref`, or if the value is
+ * a null function reference.
+ *
+ * The `val` pointer must not be NULL.
+ */
+WASM_API_EXTERN own wasm_func_t* wasmtime_funcref_as_func(const wasm_val_t* val);
+
 /**
 * \brief Loads a #wasm_extern_t from the caller's context
 *
@@ -740,7 +761,7 @@ WASM_API_EXTERN own wasmtime_error_t *wasmtime_instance_new(
 * returned error and module are owned by the caller.
 */
 WASM_API_EXTERN own wasmtime_error_t *wasmtime_module_new(
-    wasm_store_t *store,
+    wasm_engine_t *engine,
    const wasm_byte_vec_t *binary,
    own wasm_module_t **ret
 );
@@ -845,8 +866,10 @@ WASM_API_EXTERN wasmtime_error_t *wasmtime_funcref_table_grow(
 * This function does not take an associated finalizer to clean up the data when
 * the reference is reclaimed. If you need a finalizer to clean up the data,
 * then use #wasmtime_externref_new_with_finalizer.
+ *
+ * Gives ownership of the newly created `externref` value.
 */
-WASM_API_EXTERN void wasmtime_externref_new(void *data, wasm_val_t *valp);
+WASM_API_EXTERN void wasmtime_externref_new(own void *data, wasm_val_t *valp);

 /**
 * \brief A finalizer for an `externref`'s wrapped data.
@@ -866,9 +889,11 @@ typedef void (*wasmtime_externref_finalizer_t)(void*);
 * When the reference is reclaimed, the wrapped data is cleaned up with the
 * provided finalizer. If you do not need to clean up the wrapped data, then use
 * #wasmtime_externref_new.
+ *
+ * Gives ownership of the newly created `externref` value.
 */
 WASM_API_EXTERN void wasmtime_externref_new_with_finalizer(
-    void *data,
+    own void *data,
    wasmtime_externref_finalizer_t finalizer,
    wasm_val_t *valp
 );
@@ -887,7 +912,8 @@ WASM_API_EXTERN void wasmtime_externref_new_with_finalizer(
 * If the given value is not an `externref`, returns `false` and leaves `datap`
 * unmodified.
 *
- * Does not take ownership of `val`.
+ * Does not take ownership of `val`. Does not give up ownership of the `void*`
+ * data written to `datap`.
 *
 * Both `val` and `datap` must not be `NULL`.
 */
--- a/crates/c-api/src/func.rs
+++ b/crates/c-api/src/func.rs
@@ -6,7 +6,7 @@ use std::mem::MaybeUninit;
 use std::panic::{self, AssertUnwindSafe};
 use std::ptr;
 use std::str;
-use wasmtime::{Caller, Extern, Func, Trap};
+use wasmtime::{Caller, Extern, Func, Trap, Val};

 #[derive(Clone)]
 #[repr(transparent)]
@@ -275,3 +275,21 @@ pub extern "C" fn wasmtime_caller_export_get(
    let which = caller.caller.get_export(name)?;
    Some(Box::new(wasm_extern_t { which }))
 }
+
+#[no_mangle]
+pub extern "C" fn wasmtime_func_as_funcref(
+    func: &wasm_func_t,
+    funcrefp: &mut MaybeUninit<wasm_val_t>,
+) {
+    let funcref = wasm_val_t::from_val(Val::FuncRef(Some(func.func().clone())));
+    crate::initialize(funcrefp, funcref);
+}
+
+#[no_mangle]
+pub extern "C" fn wasmtime_funcref_as_func(val: &wasm_val_t) -> Option<Box<wasm_func_t>> {
+    if let Val::FuncRef(Some(f)) = val.val() {
+        Some(Box::new(f.into()))
+    } else {
+        None
+    }
+}
--- a/crates/c-api/src/module.rs
+++ b/crates/c-api/src/module.rs
@@ -1,6 +1,6 @@
 use crate::{
-    handle_result, wasm_byte_vec_t, wasm_exporttype_t, wasm_exporttype_vec_t, wasm_importtype_t,
-    wasm_importtype_vec_t, wasm_store_t, wasmtime_error_t,
+    handle_result, wasm_byte_vec_t, wasm_engine_t, wasm_exporttype_t, wasm_exporttype_vec_t,
+    wasm_importtype_t, wasm_importtype_vec_t, wasm_store_t, wasmtime_error_t,
 };
 use std::ptr;
 use wasmtime::{Engine, Module};
@@ -29,7 +29,10 @@ pub extern "C" fn wasm_module_new(
    binary: &wasm_byte_vec_t,
 ) -> Option<Box<wasm_module_t>> {
    let mut ret = ptr::null_mut();
-    match wasmtime_module_new(store, binary, &mut ret) {
+    let engine = wasm_engine_t {
+        engine: store.store.engine().clone(),
+    };
+    match wasmtime_module_new(&engine, binary, &mut ret) {
        Some(_err) => None,
        None => {
            assert!(!ret.is_null());
@@ -40,13 +43,12 @@ pub extern "C" fn wasm_module_new(

 #[no_mangle]
 pub extern "C" fn wasmtime_module_new(
-    store: &wasm_store_t,
+    engine: &wasm_engine_t,
    binary: &wasm_byte_vec_t,
    ret: &mut *mut wasm_module_t,
 ) -> Option<Box<wasmtime_error_t>> {
    let binary = binary.as_slice();
-    let store = &store.store;
-    handle_result(Module::from_binary(store.engine(), binary), |module| {
+    handle_result(Module::from_binary(&engine.engine, binary), |module| {
        let imports = module
            .imports()
            .map(|i| wasm_importtype_t::new(i.module().to_owned(), i.name().to_owned(), i.ty()))
--- a/crates/c-api/src/table.rs
+++ b/crates/c-api/src/table.rs
@@ -91,7 +91,7 @@ pub extern "C" fn wasm_table_get(
    index: wasm_table_size_t,
 ) -> Option<Box<wasm_ref_t>> {
    let val = t.table().get(index)?;
-    Some(val_into_ref(val).unwrap())
+    val_into_ref(val)
 }

 #[no_mangle]
--- a/crates/c-api/src/val.rs
+++ b/crates/c-api/src/val.rs
@@ -26,7 +26,9 @@ impl Drop for wasm_val_t {
    fn drop(&mut self) {
        match into_valtype(self.kind) {
            ValType::ExternRef => unsafe {
+                if !self.of.ref_.is_null() {
                    drop(Box::from_raw(self.of.ref_));
+                }
            },
            _ => {}
        }
@@ -116,7 +118,20 @@ impl wasm_val_t {
            ValType::I64 => Val::from(unsafe { self.of.i64 }),
            ValType::F32 => Val::from(unsafe { self.of.f32 }),
            ValType::F64 => Val::from(unsafe { self.of.f64 }),
-            ValType::ExternRef | ValType::FuncRef => ref_to_val(unsafe { &*self.of.ref_ }),
+            ValType::ExternRef => unsafe {
+                if self.of.ref_.is_null() {
+                    Val::ExternRef(None)
+                } else {
+                    ref_to_val(&*self.of.ref_)
+                }
+            },
+            ValType::FuncRef => unsafe {
+                if self.of.ref_.is_null() {
+                    Val::FuncRef(None)
+                } else {
+                    ref_to_val(&*self.of.ref_)
+                }
+            },
            _ => unimplemented!("wasm_val_t::val {:?}", self.kind),
        }
    }
--- a/crates/jit/src/link.rs
+++ b/crates/jit/src/link.rs
@@ -98,12 +98,13 @@ fn apply_reloc(
            write_unaligned(reloc_address as *mut u32, reloc_delta_u32);
        },
        #[cfg(target_pointer_width = "64")]
-        (RelocationKind::Relative, RelocationEncoding::X86Branch, 32) => unsafe {
+        (RelocationKind::Relative, RelocationEncoding::Generic, 32) => unsafe {
            let reloc_address = body.add(offset as usize) as usize;
            let reloc_addend = r.addend() as isize;
            let reloc_delta_u64 = (target_func_address as u64)
                .wrapping_sub(reloc_address as u64)
                .wrapping_add(reloc_addend as u64);
+            // TODO implement far calls mode in x64 new backend.
            assert!(
                reloc_delta_u64 as isize <= i32::max_value() as isize,
                "relocation too large to fit in i32"
--- a/docs/stability-platform-support.md
+++ b/docs/stability-platform-support.md
@@ -10,6 +10,7 @@ snapshot of what the current state of the world looks like.
 All features of `wasmtime` should work on the following platforms:

 * Linux x86\_64
+* Linux aarch64
 * macOS x86\_64
 * Windows x86\_64

@@ -18,9 +19,8 @@ sections below!

 ## JIT compiler support

-The JIT compiler, backed by either `lightbeam` or `cranelift` supports only the
-x86\_64 architecture at this time. Support for at least ARM, AArch64, and x86 is
-planned at this time.
+The JIT compiler, backed by Cranelift, supports the x86\_64 and aarch64
+architectures at this time. Support for at least ARM and x86 is planned as well.

 Usage of the JIT compiler will require a host operating system which supports
 creating executable memory pages on-the-fly. In Rust terms this generally means
@@ -39,5 +39,6 @@ much else will be needed.
 The `wasmtime` project does not currently use `#[no_std]` for its crates, but
 this is not because it won't support it! At this time we're still gathering use
 cases for what `#[no_std]` might entail, so if you're interested in this
-we'd love to hear about your use case! Feel free to open an issue on the
+we'd love to hear about your use case! Feel free to [open an
+issue](https://github.com/bytecodealliance/wasmtime/issues/new) on the
 `wasmtime` repository to discuss this.
--- a/examples/externref.c
+++ b/examples/externref.c
@@ -66,7 +66,7 @@ int main() {
  // Now that we've got our binary webassembly we can compile our module.
  printf("Compiling module...\n");
  wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
  wasm_byte_vec_delete(&wasm);
  if (error != NULL)
    exit_with_error("failed to compile module", error, NULL);
--- a/examples/fib-debug/main.c
+++ b/examples/fib-debug/main.c
@@ -43,7 +43,7 @@ int main(int argc, const char* argv[]) {
  // Compile.
  printf("Compiling module...\n");
  wasm_module_t *module = NULL;
-  wasmtime_error_t* error = wasmtime_module_new(store, &binary, &module);
+  wasmtime_error_t* error = wasmtime_module_new(engine, &binary, &module);
  if (!module)
    exit_with_error("failed to compile module", error, NULL);
  wasm_byte_vec_delete(&binary);
--- a/examples/gcd.c
+++ b/examples/gcd.c
@@ -59,7 +59,7 @@ int main() {

  // Compile and instantiate our module
  wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
  if (module == NULL)
    exit_with_error("failed to compile module", error, NULL);
  wasm_byte_vec_delete(&wasm);
--- a/examples/hello.c
+++ b/examples/hello.c
@@ -67,7 +67,7 @@ int main() {
  // Now that we've got our binary webassembly we can compile our module.
  printf("Compiling module...\n");
  wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
  wasm_byte_vec_delete(&wasm);
  if (error != NULL)
    exit_with_error("failed to compile module", error, NULL);
--- a/examples/hello.cc
+++ b/examples/hello.cc
@@ -67,7 +67,7 @@ int main() {
  // Now that we've got our binary webassembly we can compile our module.
  printf("Compiling module...\n");
  wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
  wasm_byte_vec_delete(&wasm);
  if (error != NULL)
    exit_with_error("failed to compile module", error, NULL);
--- a/examples/interrupt.c
+++ b/examples/interrupt.c
@@ -89,7 +89,7 @@ int main() {
  wasm_module_t *module = NULL;
  wasm_trap_t *trap = NULL;
  wasm_instance_t *instance = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
  wasm_byte_vec_delete(&wasm);
  if (error != NULL)
    exit_with_error("failed to compile module", error, NULL);
--- a/examples/linking.c
+++ b/examples/linking.c
@@ -45,10 +45,10 @@ int main() {
  wasmtime_error_t *error;
  wasm_module_t *linking1_module = NULL;
  wasm_module_t *linking2_module = NULL;
-  error = wasmtime_module_new(store, &linking1_wasm, &linking1_module);
+  error = wasmtime_module_new(engine, &linking1_wasm, &linking1_module);
  if (error != NULL)
    exit_with_error("failed to compile linking1", error, NULL);
-  error = wasmtime_module_new(store, &linking2_wasm, &linking2_module);
+  error = wasmtime_module_new(engine, &linking2_wasm, &linking2_module);
  if (error != NULL)
    exit_with_error("failed to compile linking2", error, NULL);
  wasm_byte_vec_delete(&linking1_wasm);
--- a/examples/memory.c
+++ b/examples/memory.c
@@ -158,7 +158,7 @@ int main(int argc, const char* argv[]) {
  // Compile.
  printf("Compiling module...\n");
  wasm_module_t* module = NULL;
-  error = wasmtime_module_new(store, &binary, &module);
+  error = wasmtime_module_new(engine, &binary, &module);
  if (error)
    exit_with_error("failed to compile module", error, NULL);
  wasm_byte_vec_delete(&binary);
--- a/examples/multi.c
+++ b/examples/multi.c
@@ -91,7 +91,7 @@ int main(int argc, const char* argv[]) {
  // Compile.
  printf("Compiling module...\n");
  wasm_module_t* module = NULL;
-  error = wasmtime_module_new(store, &binary, &module);
+  error = wasmtime_module_new(engine, &binary, &module);
  if (error)
    exit_with_error("failed to compile module", error, NULL);

--- a/examples/wasi/main.c
+++ b/examples/wasi/main.c
@@ -54,7 +54,7 @@ int main() {

  // Compile our modules
  wasm_module_t *module = NULL;
-  wasmtime_error_t *error = wasmtime_module_new(store, &wasm, &module);
+  wasmtime_error_t *error = wasmtime_module_new(engine, &wasm, &module);
  if (!module)
    exit_with_error("failed to compile module", error, NULL);
  wasm_byte_vec_delete(&wasm);