properly splatting bytes in emit_small_memset

2021-05-13 22:02:17 +03:00
parent 1425c1e7bf 6fb2a24c6b
commit 38140900f1
614 changed files with 40658 additions and 7141 deletions
--- a/.github/actions/install-rust/main.js
+++ b/.github/actions/install-rust/main.js
@@ -30,3 +30,8 @@ set_env("CARGO_INCREMENTAL", "0");
 // Turn down debuginfo from 2 to 1 to help save disk space
 set_env("CARGO_PROFILE_DEV_DEBUG", "1");
 set_env("CARGO_PROFILE_TEST_DEBUG", "1");
+
+if (process.platform === 'darwin') {
+  set_env("CARGO_PROFILE_DEV_SPLIT_DEBUGINFO", "unpacked");
+  set_env("CARGO_PROFILE_TEST_SPLIT_DEBUGINFO", "unpacked");
+}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -48,6 +48,7 @@ jobs:
    - uses: actions/checkout@v2
      with:
        submodules: true
+    - run: rustup update stable && rustup default stable
    - run: |
        set -e
        curl -L https://github.com/rust-lang-nursery/mdBook/releases/download/v0.4.4/mdbook-v0.4.4-x86_64-unknown-linux-gnu.tar.gz | tar xzf -
@@ -73,12 +74,15 @@ jobs:
    - uses: actions/checkout@v2
      with:
        submodules: true
-    # Note that we use nightly Rust for the doc_cfg feature (enabled via `nightlydoc` above)
-    # This version is an older nightly for the new x64 backend (see below)
    - uses: ./.github/actions/install-rust
      with:
-        toolchain: nightly-2020-12-26
-    - run: cargo doc --no-deps --all --exclude wasmtime-cli --exclude test-programs --exclude cranelift-codegen-meta
+        toolchain: nightly-2021-04-11
+    - run: |
+        cargo doc --no-deps --workspace \
+          --exclude wasmtime-cli \
+          --exclude test-programs \
+          --exclude cranelift-codegen-meta \
+          --exclude 'peepmatic*'
    - run: cargo doc --package cranelift-codegen-meta --document-private-items
    - uses: actions/upload-artifact@v1
      with:
@@ -165,7 +169,7 @@ jobs:
    # flags to rustc.
    - uses: ./.github/actions/install-rust
      with:
-        toolchain: nightly
+        toolchain: nightly-2021-04-11
    - run: cargo install cargo-fuzz --vers "^0.8"
    - run: cargo fetch
      working-directory: ./fuzz
@@ -178,16 +182,9 @@ jobs:
    - uses: actions/checkout@v2
      with:
        submodules: true
+    - run: rustup update stable && rustup default stable
    - name: Test `peepmatic`
-      run: |
-        cargo test \
-          --package peepmatic \
-          --package peepmatic-automata \
-          --package peepmatic-fuzzing \
-          --package peepmatic-macro \
-          --package peepmatic-runtime \
-          --package peepmatic-test \
-          --package peepmatic-souper
+      run: cargo test --package 'peepmatic*'
    - name: Rebuild Peepmatic-based peephole optimizers
      run: |
        cargo test \
@@ -211,6 +208,7 @@ jobs:
    name: Test
    runs-on: ${{ matrix.os }}
    strategy:
+      fail-fast: false
      matrix:
        build: [stable, beta, nightly, windows, macos]
        include:
@@ -222,7 +220,7 @@ jobs:
            rust: beta
          - build: nightly
            os: ubuntu-latest
-            rust: nightly
+            rust: nightly-2021-04-11
          - build: macos
            os: macos-latest
            rust: stable
@@ -270,18 +268,10 @@ jobs:
    - run: |
        cargo test \
            --features test-programs/test_programs \
-            --all \
-            --exclude lightbeam \
-            --exclude wasmtime-lightbeam \
-            --exclude wasmtime-wasi-nn \
-            --exclude wasmtime-wasi-crypto \
-            --exclude peepmatic \
-            --exclude peepmatic-automata \
-            --exclude peepmatic-fuzzing \
-            --exclude peepmatic-macro \
-            --exclude peepmatic-runtime \
-            --exclude peepmatic-test \
-            --exclude peepmatic-souper
+            --workspace \
+            --exclude '*lightbeam*' \
+            --exclude 'wasmtime-wasi-*' \
+            --exclude 'peepmatic*'
      env:
        RUST_BACKTRACE: 1

@@ -297,7 +287,7 @@ jobs:
    # Test debug (DWARF) related functionality on new backend.
    - run: |
        sudo apt-get update && sudo apt-get install -y gdb lldb
-        cargo test --features experimental_x64 test_debug_dwarf -- --ignored --test-threads 1 --test debug::
+        cargo test test_debug_dwarf -- --ignored --test-threads 1 --test debug::
      if: matrix.os == 'ubuntu-latest'
      env:
        RUST_BACKTRACE: 1
@@ -320,13 +310,9 @@ jobs:
      env:
        RUST_BACKTRACE: 1

-  # Perform all tests (debug mode) for `wasmtime` with the experimental x64
-  # backend. This runs on an older nightly of Rust (because of issues with
-  # unifying Cargo features on stable) on Ubuntu such that it's new enough
-  # to build Wasmtime, but old enough where the -Z options being used
-  # haven't been stabilized yet.
+  # Perform all tests (debug mode) for `wasmtime` with the old x86 backend.
  test_x64:
-    name: Test x64 new backend
+    name: Test old x86 backend
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2
@@ -334,7 +320,7 @@ jobs:
        submodules: true
    - uses: ./.github/actions/install-rust
      with:
-        toolchain: nightly-2020-12-26
+        toolchain: stable
    - uses: ./.github/actions/define-llvm-env

    # Install wasm32 targets in order to build various tests throughout the
@@ -342,43 +328,9 @@ jobs:
    - run: rustup target add wasm32-wasi
    - run: rustup target add wasm32-unknown-unknown

-    # Run the x64 CI script.
-    - run: ./ci/run-experimental-x64-ci.sh
+    # Run the old x86 backend CI (we will eventually remove this).
+    - run: ./ci/run-old-x86-ci.sh
      env:
-        CARGO_VERSION: "+nightly-2020-12-26"
-        RUST_BACKTRACE: 1
-
-  # Perform tests on the new x64 backend on Windows as well.
-  test_x64_win:
-    name: Test x64 new backend on Windows
-    runs-on: windows-latest
-    steps:
-    - uses: actions/checkout@v2
-      with:
-        submodules: true
-    - uses: ./.github/actions/install-rust
-      with:
-        toolchain: nightly-2020-11-29
-    - uses: ./.github/actions/define-llvm-env
-
-    - name: Install libclang
-      # Note: libclang is pre-installed on the macOS and linux images.
-      if: matrix.os == 'windows-latest'
-      run: |
-        curl https://releases.llvm.org/9.0.0/LLVM-9.0.0-win64.exe -o llvm-installer.exe
-        7z x llvm-installer.exe -oC:/llvm-binary
-        echo LIBCLANG_PATH=C:/llvm-binary/bin/libclang.dll >> $GITHUB_ENV
-        echo C:/llvm-binary/bin >> $GITHUB_PATH
-
-    # Install wasm32 targets in order to build various tests throughout the
-    # repo.
-    - run: rustup target add wasm32-wasi
-    - run: rustup target add wasm32-unknown-unknown
-
-    # Run the x64 CI script.
-    - run: ./ci/run-experimental-x64-ci.sh
-      env:
-        CARGO_VERSION: "+nightly-2020-11-29"
        RUST_BACKTRACE: 1

  # Build and test the wasi-nn module.
@@ -390,8 +342,6 @@ jobs:
        with:
          submodules: true
      - uses: ./.github/actions/install-rust
-        with:
-          toolchain: nightly
      - run: rustup target add wasm32-wasi
      - uses: ./.github/actions/install-openvino
      - run: ./ci/run-wasi-nn-example.sh
@@ -433,6 +383,7 @@ jobs:
    name: Build wasmtime
    runs-on: ${{ matrix.os }}
    strategy:
+      fail-fast: false
      matrix:
        include:
        - build: x86_64-linux
@@ -517,18 +468,10 @@ jobs:
        $CENTOS cargo test \
            --features test-programs/test_programs \
            --release \
-            --all \
-            --exclude lightbeam \
-            --exclude wasmtime-lightbeam \
-            --exclude wasmtime-wasi-nn \
-            --exclude wasmtime-wasi-crypto \
-            --exclude peepmatic \
-            --exclude peepmatic-automata \
-            --exclude peepmatic-fuzzing \
-            --exclude peepmatic-macro \
-            --exclude peepmatic-runtime \
-            --exclude peepmatic-test \
-            --exclude peepmatic-souper \
+            --workspace \
+            --exclude '*lightbeam*' \
+            --exclude 'wasmtime-wasi-*' \
+            --exclude 'peepmatic*' \
            --exclude wasmtime-fuzz
      env:
        RUST_BACKTRACE: 1
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "wasmtime-cli"
-version = "0.25.0"
+version = "0.26.0"
 authors = ["The Wasmtime Project Developers"]
 description = "Command-line interface for Wasmtime"
 license = "Apache-2.0 WITH LLVM-exception"
@@ -22,31 +22,29 @@ doc = false

 [dependencies]
 # Enable all supported architectures by default.
-wasmtime = { path = "crates/wasmtime", version = "0.25.0", default-features = false, features = ['cache'] }
-wasmtime-cache = { path = "crates/cache", version = "0.25.0" }
-wasmtime-debug = { path = "crates/debug", version = "0.25.0" }
-wasmtime-environ = { path = "crates/environ", version = "0.25.0" }
-wasmtime-jit = { path = "crates/jit", version = "0.25.0" }
-wasmtime-obj = { path = "crates/obj", version = "0.25.0" }
-wasmtime-wast = { path = "crates/wast", version = "0.25.0" }
-wasmtime-wasi = { path = "crates/wasi", version = "0.25.0" }
-wasmtime-wasi-crypto = { path = "crates/wasi-crypto", version = "0.25.0", optional = true }
-wasmtime-wasi-nn = { path = "crates/wasi-nn", version = "0.25.0", optional = true }
-wasi-common = { path = "crates/wasi-common", version = "0.25.0" }
-wasi-cap-std-sync = { path = "crates/wasi-common/cap-std-sync", version = "0.25.0" }
+wasmtime = { path = "crates/wasmtime", version = "0.26.0", default-features = false, features = ['cache'] }
+wasmtime-cache = { path = "crates/cache", version = "0.26.0" }
+wasmtime-debug = { path = "crates/debug", version = "0.26.0" }
+wasmtime-environ = { path = "crates/environ", version = "0.26.0" }
+wasmtime-jit = { path = "crates/jit", version = "0.26.0" }
+wasmtime-obj = { path = "crates/obj", version = "0.26.0" }
+wasmtime-wast = { path = "crates/wast", version = "0.26.0" }
+wasmtime-wasi = { path = "crates/wasi", version = "0.26.0" }
+wasmtime-wasi-crypto = { path = "crates/wasi-crypto", version = "0.26.0", optional = true }
+wasmtime-wasi-nn = { path = "crates/wasi-nn", version = "0.26.0", optional = true }
 structopt = { version = "0.3.5", features = ["color", "suggestions"] }
-object = { version = "0.23.0", default-features = false, features = ["write"] }
+object = { version = "0.24.0", default-features = false, features = ["write"] }
 anyhow = "1.0.19"
-target-lexicon = { version = "0.11.0", default-features = false }
+target-lexicon = { version = "0.12.0", default-features = false }
 pretty_env_logger = "0.4.0"
 file-per-thread-logger = "0.1.1"
-wat = "1.0.36"
+wat = "1.0.37"
 libc = "0.2.60"
 log = "0.4.8"
 rayon = "1.2.1"
 humantime = "2.0.0"
-wasmparser = "0.76.0"
-cap-std = "0.13"
+wasmparser = "0.77.0"
+lazy_static = "1.4.0"

 [dev-dependencies]
 env_logger = "0.8.1"
@@ -56,6 +54,7 @@ tempfile = "3.1.0"
 test-programs = { path = "crates/test-programs" }
 wasmtime-fuzzing = { path = "crates/fuzzing" }
 wasmtime-runtime = { path = "crates/runtime" }
+tokio = { version = "1.5.0", features = ["rt", "time", "macros", "rt-multi-thread"] }
 tracing-subscriber = "0.2.16"
 wast = "35.0.0"

@@ -66,6 +65,7 @@ anyhow = "1.0.19"
 opt-level = 0

 [workspace]
+resolver = '2'
 members = [
  "cranelift",
  "crates/bench-api",
@@ -79,23 +79,29 @@ members = [
  "crates/wiggle/wasmtime",
  "crates/wasi-common",
  "crates/wasi-common/cap-std-sync",
+  "crates/wasi-common/tokio",
  "examples/fib-debug/wasm",
  "examples/wasi/wasm",
+  "examples/tokio/wasm",
  "fuzz",
 ]

 [features]
-default = ["jitdump", "wasmtime/wat", "wasmtime/parallel-compilation"]
+default = ["jitdump", "wasmtime/wat", "wasmtime/parallel-compilation", "wasi-nn"]
 lightbeam = ["wasmtime/lightbeam"]
 jitdump = ["wasmtime/jitdump"]
 vtune = ["wasmtime/vtune"]
 wasi-crypto = ["wasmtime-wasi-crypto"]
 wasi-nn = ["wasmtime-wasi-nn"]
 uffd = ["wasmtime/uffd"]
+all-arch = ["wasmtime/all-arch"]

-# Try the experimental, work-in-progress new x86_64 backend. This is not stable
-# as of June 2020.
-experimental_x64 = ["wasmtime-jit/experimental_x64"]
+# Stub feature that does nothing, for Cargo-features compatibility: the new
+# backend is the default now.
+experimental_x64 = []
+
+# Use the old x86 backend.
+old-x86-backend = ["wasmtime/old-x86-backend"]

 [badges]
 maintenance = { status = "actively-developed" }
@@ -104,5 +110,9 @@ maintenance = { status = "actively-developed" }
 name = "host_segfault"
 harness = false

+[[example]]
+name = "tokio"
+required-features = ["wasmtime-wasi/tokio"]
+
 [profile.dev.package.backtrace]
 debug = false # FIXME(#1813)
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -2,6 +2,137 @@

 --------------------------------------------------------------------------------

+## Unreleased
+
+### Added
+
+* Added `Store::with_limits`, `StoreLimits`, and `ResourceLimiter` to the
+  Wasmtime API to help with enforcing resource limits at runtime. The
+  `ResourceLimiter` trait can be implemented by custom resource limiters to
+  decide if linear memories or tables can be grown.
+
+### Changed
+
+* Breaking: `Memory::new` has been changed to return `Result` as creating a
+  host memory object is now a fallible operation when the initial size of
+  the memory exceeds the store limits.
+
+## 0.26.0
+
+Released 2021-04-05.
+
+### Added
+
+* Added the `wasmtime compile` command to support AOT compilation of Wasm
+  modules. This adds the `Engine::precompile_module` method. Also added the
+  `Config::target` method to change the compilation target of the
+  configuration. This can be used in conjunction with
+  `Engine::precompile_module` to target a different host triple than the
+  current one.
+  [#2791](https://github.com/bytecodealliance/wasmtime/pull/2791)
+
+* Support for macOS on aarch64 (Apple M1 Silicon), including Apple-specific
+  calling convention details and unwinding/exception handling using Mach ports.
+  [#2742](https://github.com/bytecodealliance/wasmtime/pull/2742),
+  [#2723](https://github.com/bytecodealliance/wasmtime/pull/2723)
+
+* A number of SIMD instruction implementations in the new x86-64 backend.
+  [#2771](https://github.com/bytecodealliance/wasmtime/pull/2771)
+
+* Added the `Config::cranelift_flag_enable` method to enable setting Cranelift
+  boolean flags or presets in a config.
+
+* Added CLI option `--cranelift-enable` to enable boolean settings and ISA presets.
+
+* Deduplicate function signatures in Wasm modules.
+  [#2772](https://github.com/bytecodealliance/wasmtime/pull/2772)
+
+* Optimize overheads of calling into Wasm functions.
+  [#2757](https://github.com/bytecodealliance/wasmtime/pull/2757),
+  [#2759](https://github.com/bytecodealliance/wasmtime/pull/2759)
+
+* Improvements related to Module Linking: compile fewer trampolines.
+
+  [#2774](https://github.com/bytecodealliance/wasmtime/pull/2774)
+
+* Re-export sibling crates from `wasmtime-wasi` to make embedding easier
+  without needing to match crate versions.
+  [#2776](https://github.com/bytecodealliance/wasmtime/pull/2776)
+
+### Changed
+
+* Switched the default compiler backend on x86-64 to Cranelift's new backend.
+  This should not have any user-visible effects other than possibly runtime
+  performance improvements. The old backend is still available with the
+  `old-x86-backend` feature flag to the `cranelift-codegen` or `wasmtime`
+  crates, or programmatically with `BackendVariant::Legacy`. We plan to
+  maintain the old backend for at least one more release and ensure it works on
+  CI.
+  [#2718](https://github.com/bytecodealliance/wasmtime/pull/2718)
+
+* Breaking: `Module::deserialize` has been removed in favor of `Module::new`.
+
+* Breaking: `Config::cranelift_clear_cpu_flags` was removed. Use `Config::target`
+  to clear the CPU flags for the host's target.
+
+* Breaking: `Config::cranelift_other_flag` was renamed to `Config::cranelift_flag_set`.
+
+* CLI changes:
+  * Wasmtime CLI options to enable WebAssembly features have been replaced with
+    a singular `--wasm-features` option. The previous options are still
+    supported, but are not displayed in help text.
+  * Breaking: the CLI option `--cranelift-flags` was changed to
+    `--cranelift-set`.
+  * Breaking: the CLI option `--enable-reference-types=false` has been changed
+    to `--wasm-features=-reference-types`.
+  * Breaking: the CLI option `--enable-multi-value=false` has been changed to
+    `--wasm-features=-multi-value`.
+  * Breaking: the CLI option `--enable-bulk-memory=false` has been changed to
+    `--wasm-features=-bulk-memory`.
+
+* Improved error-reporting in wiggle.
+  [#2760](https://github.com/bytecodealliance/wasmtime/pull/2760)
+
+* Make WASI sleeping fallible (some systems do not support sleep).
+  [#2756](https://github.com/bytecodealliance/wasmtime/pull/2756)
+
+* WASI: Support `poll_oneoff` with a sleep.
+  [#2753](https://github.com/bytecodealliance/wasmtime/pull/2753)
+
+* Allow a `StackMapSink` to be passed when defining functions with
+  `cranelift-module`.
+  [#2739](https://github.com/bytecodealliance/wasmtime/pull/2739)
+
+* Some refactoring in new x86-64 backend to prepare for VEX/EVEX (e.g.,
+  AVX-512) instruction encodings to be supported.
+  [#2799](https://github.com/bytecodealliance/wasmtime/pull/2799)
+
+### Fixed
+
+* Fixed a corner case in `srem` (signed remainder) in the new x86-64 backend:
+  `INT_MIN % -1` should return `0`, rather than trapping. This only occurred
+  when `avoid_div_traps == false` was set by the embedding.
+  [#2763](https://github.com/bytecodealliance/wasmtime/pull/2763)
+
+* Fixed a memory leak of the `Store` when an instance traps.
+  [#2803](https://github.com/bytecodealliance/wasmtime/pull/2803)
+
+* Some fuzzing-related fixes.
+  [#2788](https://github.com/bytecodealliance/wasmtime/pull/2788),
+  [#2770](https://github.com/bytecodealliance/wasmtime/pull/2770)
+
+* Fixed memory-initialization bug in uffd allocator that could copy into the
+  wrong destination under certain conditions. Does not affect the default
+  wasmtime instance allocator.
+  [#2801](https://github.com/bytecodealliance/wasmtime/pull/2801)
+
+* Fix printing of float values from the Wasmtime CLI.
+  [#2797](https://github.com/bytecodealliance/wasmtime/pull/2797)
+
+* Remove the ability for the `Linker` to instantiate modules with duplicate
+  import strings of different types.
+  [#2789](https://github.com/bytecodealliance/wasmtime/pull/2789)
+
 ## 0.25.0

 Released 2021-03-16.
@@ -39,7 +170,7 @@ Released 2021-03-16.

 ### Fixed

-* Interepretation of timestamps in `poll_oneoff` for WASI have been fixed to
+* Interpretation of timestamps in `poll_oneoff` for WASI has been fixed to
  correctly use nanoseconds instead of microseconds.
  [#2717](https://github.com/bytecodealliance/wasmtime/pull/2717)

--- a/build.rs
+++ b/build.rs
@@ -155,11 +155,8 @@ fn write_testsuite_tests(
    let testname = extract_name(path);

    writeln!(out, "#[test]")?;
-    if experimental_x64_should_panic(testsuite, &testname, strategy) {
-        writeln!(
-            out,
-            r#"#[cfg_attr(feature = "experimental_x64", should_panic)]"#
-        )?;
+    if x64_should_panic(testsuite, &testname, strategy) {
+        writeln!(out, r#"#[should_panic]"#)?;
    } else if ignore(testsuite, &testname, strategy) {
        writeln!(out, "#[ignore]")?;
    } else if pooling {
@@ -186,10 +183,10 @@ fn write_testsuite_tests(
    Ok(())
 }

-/// For experimental_x64 backend features that are not supported yet, mark tests as panicking, so
+/// For x64 backend features that are not supported yet, mark tests as panicking, so
 /// they stop "passing" once the features are properly implemented.
-fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
-    if !cfg!(feature = "experimental_x64") || strategy != "Cranelift" {
+fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
+    if !platform_is_x64() || strategy != "Cranelift" {
        return false;
    }

@@ -222,12 +219,10 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
            _ => (),
        },
        "Cranelift" => match (testsuite, testname) {
-            // TODO(#1886): Ignore reference types tests if this isn't x64,
-            // because Cranelift only supports reference types on x64.
-            ("reference_types", _) => {
-                return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
-            }
+            // No simd support yet for s390x.
+            ("simd", _) if platform_is_s390x() => return true,

+            ("simd", _) if cfg!(feature = "old-x86-backend") => return true, // skip all SIMD tests on old backend.
            // These are new instructions that are not really implemented in any backend.
            ("simd", "simd_i8x16_arith2")
            | ("simd", "simd_conversions")
@@ -240,22 +235,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
            | ("simd", "simd_i64x2_extmul_i32x4")
            | ("simd", "simd_int_to_int_extend") => return true,

-            // These are only implemented on x64.
-            ("simd", "simd_i64x2_arith2") | ("simd", "simd_boolean") => {
-                return !cfg!(feature = "experimental_x64")
-            }
-
-            // These are only implemented on aarch64 and x64.
-            ("simd", "simd_i64x2_cmp")
-            | ("simd", "simd_f32x4_pmin_pmax")
-            | ("simd", "simd_f64x2_pmin_pmax")
-            | ("simd", "simd_f32x4_rounding")
-            | ("simd", "simd_f64x2_rounding")
-            | ("simd", "simd_i32x4_dot_i16x8") => {
-                return !(cfg!(feature = "experimental_x64")
-                    || env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "aarch64")
-            }
-
            _ => {}
        },
        _ => panic!("unrecognized strategy"),
@@ -263,3 +242,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {

    false
 }
+
+fn platform_is_x64() -> bool {
+    env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "x86_64"
+}
+
+fn platform_is_s390x() -> bool {
+    env::var("CARGO_CFG_TARGET_ARCH").unwrap() == "s390x"
+}
--- a/ci/run-experimental-x64-ci.sh
+++ b/ci/run-experimental-x64-ci.sh
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-# Use the Nightly variant of the compiler to properly unify the
-# experimental_x64 feature across all crates.  Once the feature has stabilized
-# and become the default, we can remove this.
-CARGO_VERSION=${CARGO_VERSION:-"+nightly"}
-
-# Some WASI tests seem to have an issue on Windows with symlinks if we run them
-# with this particular invocation. It's unclear why (nightly toolchain?) but
-# we're moving to the new backend by default soon enough, and all tests seem to
-# work with the main test setup, so let's just work around this by skipping
-# the tests for now.
-MINGW_EXTRA=""
-if [ `uname -o` == "Msys" ]; then
-	MINGW_EXTRA="-- --skip wasi_cap_std_sync"
-fi
-
-cargo $CARGO_VERSION \
-            --locked \
-            -Zfeatures=all -Zpackage-features \
-            test \
-            --features test-programs/test_programs \
-            --features experimental_x64 \
-            --all \
-            --exclude wasmtime-lightbeam \
-            --exclude wasmtime-wasi-nn \
-            --exclude wasmtime-wasi-crypto \
-            --exclude peepmatic \
-            --exclude peepmatic-automata \
-            --exclude peepmatic-fuzzing \
-            --exclude peepmatic-macro \
-            --exclude peepmatic-runtime \
-            --exclude peepmatic-test \
-            --exclude peepmatic-souper \
-            --exclude lightbeam \
-	    $MINGW_EXTRA
--- a/ci/run-old-x86-ci.sh
+++ b/ci/run-old-x86-ci.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+cargo test \
+            --locked \
+            --features test-programs/test_programs \
+            --features old-x86-backend \
+            --all \
+            --exclude wasmtime-lightbeam \
+            --exclude wasmtime-wasi-nn \
+            --exclude wasmtime-wasi-crypto \
+            --exclude peepmatic \
+            --exclude peepmatic-automata \
+            --exclude peepmatic-fuzzing \
+            --exclude peepmatic-macro \
+            --exclude peepmatic-runtime \
+            --exclude peepmatic-test \
+            --exclude peepmatic-souper \
+            --exclude lightbeam
--- a/ci/run-wasi-crypto-example.sh
+++ b/ci/run-wasi-crypto-example.sh
@@ -7,4 +7,4 @@ pushd "$RUST_BINDINGS"
 cargo build --release --target=wasm32-wasi
 popd

-cargo run --features wasi-crypto -- run "$RUST_BINDINGS/target/wasm32-wasi/release/wasi-crypto-guest.wasm"
+cargo run --features wasi-crypto -- run "$RUST_BINDINGS/target/wasm32-wasi/release/wasi-crypto-guest.wasm" --wasi-modules=experimental-wasi-crypto
--- a/ci/run-wasi-nn-example.sh
+++ b/ci/run-wasi-nn-example.sh
@@ -7,7 +7,7 @@
 # executed with the Wasmtime CLI.
 set -e
 WASMTIME_DIR=$(dirname "$0" | xargs dirname)
-FIXTURE=https://github.com/intel/openvino-rs/raw/main/crates/openvino/tests/fixtures/alexnet
+FIXTURE=https://github.com/intel/openvino-rs/raw/main/crates/openvino/tests/fixtures/mobilenet
 if [ -z "${1+x}" ]; then
    # If no temporary directory is specified, create one.
    TMP_DIR=$(mktemp -d -t ci-XXXXXXXXXX)
@@ -26,9 +26,9 @@ source /opt/intel/openvino/bin/setupvars.sh
 OPENVINO_INSTALL_DIR=/opt/intel/openvino cargo build -p wasmtime-cli --features wasi-nn

 # Download all necessary test fixtures to the temporary directory.
-wget --no-clobber --directory-prefix=$TMP_DIR $FIXTURE/alexnet.bin
-wget --no-clobber --directory-prefix=$TMP_DIR $FIXTURE/alexnet.xml
-wget --no-clobber --directory-prefix=$TMP_DIR $FIXTURE/tensor-1x3x227x227-f32.bgr
+wget --no-clobber $FIXTURE/mobilenet.bin --output-document=$TMP_DIR/model.bin
+wget --no-clobber $FIXTURE/mobilenet.xml --output-document=$TMP_DIR/model.xml
+wget --no-clobber $FIXTURE/tensor-1x224x224x3-f32.bgr --output-document=$TMP_DIR/tensor.bgr

 # Now build an example that uses the wasi-nn API.
 pushd $WASMTIME_DIR/crates/wasi-nn/examples/classification-example
@@ -37,7 +37,7 @@ cp target/wasm32-wasi/release/wasi-nn-example.wasm $TMP_DIR
 popd

 # Run the example in Wasmtime (note that the example uses `fixture` as the expected location of the model/tensor files).
-OPENVINO_INSTALL_DIR=/opt/intel/openvino cargo run --features wasi-nn -- run --mapdir fixture::$TMP_DIR $TMP_DIR/wasi-nn-example.wasm
+cargo run -- run --mapdir fixture::$TMP_DIR $TMP_DIR/wasi-nn-example.wasm --wasi-modules=experimental-wasi-nn

 # Clean up the temporary directory only if it was not specified (users may want to keep the directory around).
 if [[ $REMOVE_TMP_DIR -eq 1 ]]; then
--- a/cranelift/Cargo.toml
+++ b/cranelift/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cranelift-tools"
 authors = ["The Cranelift Project Developers"]
-version = "0.66.0"
+version = "0.73.0"
 description = "Binaries for testing the Cranelift libraries"
 license = "Apache-2.0 WITH LLVM-exception"
 documentation = "https://github.com/bytecodealliance/wasmtime/blob/main/cranelift/docs/index.md"
@@ -15,27 +15,27 @@ path = "src/clif-util.rs"

 [dependencies]
 cfg-if = "1.0"
-cranelift-codegen = { path = "codegen", version = "0.72.0" }
-cranelift-entity = { path = "entity", version = "0.72.0" }
-cranelift-interpreter = { path = "interpreter", version = "0.72.0" }
-cranelift-reader = { path = "reader", version = "0.72.0" }
-cranelift-frontend = { path = "frontend", version = "0.72.0" }
-cranelift-serde = { path = "serde", version = "0.72.0", optional = true }
-cranelift-wasm = { path = "wasm", version = "0.72.0", optional = true }
-cranelift-native = { path = "native", version = "0.72.0" }
-cranelift-filetests = { path = "filetests", version = "0.66.0" }
-cranelift-module = { path = "module", version = "0.72.0" }
-cranelift-object = { path = "object", version = "0.72.0" }
-cranelift-jit = { path = "jit", version = "0.72.0" }
-cranelift-preopt = { path = "preopt", version = "0.72.0" }
-cranelift = { path = "umbrella", version = "0.72.0" }
+cranelift-codegen = { path = "codegen", version = "0.73.0" }
+cranelift-entity = { path = "entity", version = "0.73.0" }
+cranelift-interpreter = { path = "interpreter", version = "0.73.0" }
+cranelift-reader = { path = "reader", version = "0.73.0" }
+cranelift-frontend = { path = "frontend", version = "0.73.0" }
+cranelift-serde = { path = "serde", version = "0.73.0", optional = true }
+cranelift-wasm = { path = "wasm", version = "0.73.0", optional = true }
+cranelift-native = { path = "native", version = "0.73.0" }
+cranelift-filetests = { path = "filetests", version = "0.73.0" }
+cranelift-module = { path = "module", version = "0.73.0" }
+cranelift-object = { path = "object", version = "0.73.0" }
+cranelift-jit = { path = "jit", version = "0.73.0" }
+cranelift-preopt = { path = "preopt", version = "0.73.0" }
+cranelift = { path = "umbrella", version = "0.73.0" }
 filecheck = "0.5.0"
 log = "0.4.8"
 termcolor = "1.1.2"
 capstone = { version = "0.7.0", optional = true }
 wat = { version = "1.0.36", optional = true }
-target-lexicon = { version = "0.11", features = ["std"] }
-peepmatic-souper = { path = "./peepmatic/crates/souper", version = "0.72.0", optional = true }
+target-lexicon = { version = "0.12", features = ["std"] }
+peepmatic-souper = { path = "./peepmatic/crates/souper", version = "0.73.0", optional = true }
 pretty_env_logger = "0.4.0"
 rayon = { version = "1", optional = true }
 file-per-thread-logger = "0.1.2"
@@ -50,6 +50,6 @@ default = ["disas", "wasm", "cranelift-codegen/all-arch", "peepmatic-souper", "s
 disas = ["capstone"]
 enable-peepmatic = ["cranelift-codegen/enable-peepmatic", "cranelift-filetests/enable-peepmatic"]
 wasm = ["wat", "cranelift-wasm"]
-experimental_x64 = ["cranelift-codegen/x64", "cranelift-filetests/experimental_x64", "cranelift-reader/experimental_x64"]
 experimental_arm32 = ["cranelift-codegen/arm32", "cranelift-filetests/experimental_arm32"]
 souper-harvest = ["cranelift-codegen/souper-harvest", "rayon"]
+all-arch = ["cranelift-codegen/all-arch"]
--- a/cranelift/bforest/Cargo.toml
+++ b/cranelift/bforest/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 authors = ["The Cranelift Project Developers"]
 name = "cranelift-bforest"
-version = "0.72.0"
+version = "0.73.0"
 description = "A forest of B+-trees"
 license = "Apache-2.0 WITH LLVM-exception"
 documentation = "https://docs.rs/cranelift-bforest"
@@ -12,7 +12,7 @@ keywords = ["btree", "forest", "set", "map"]
 edition = "2018"

 [dependencies]
-cranelift-entity = { path = "../entity", version = "0.72.0", default-features = false }
+cranelift-entity = { path = "../entity", version = "0.73.0", default-features = false }

 [badges]
 maintenance = { status = "experimental" }
--- a/cranelift/codegen/Cargo.toml
+++ b/cranelift/codegen/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 authors = ["The Cranelift Project Developers"]
 name = "cranelift-codegen"
-version = "0.72.0"
+version = "0.73.0"
 description = "Low-level code generator library"
 license = "Apache-2.0 WITH LLVM-exception"
 documentation = "https://docs.rs/cranelift-codegen"
@@ -13,21 +13,19 @@ build = "build.rs"
 edition = "2018"

 [dependencies]
-cranelift-codegen-shared = { path = "./shared", version = "0.72.0" }
-cranelift-entity = { path = "../entity", version = "0.72.0" }
-cranelift-bforest = { path = "../bforest", version = "0.72.0" }
+cranelift-codegen-shared = { path = "./shared", version = "0.73.0" }
+cranelift-entity = { path = "../entity", version = "0.73.0" }
+cranelift-bforest = { path = "../bforest", version = "0.73.0" }
 hashbrown = { version = "0.9.1", optional = true }
-target-lexicon = "0.11"
+target-lexicon = "0.12"
 log = { version = "0.4.6", default-features = false }
 serde = { version = "1.0.94", features = ["derive"], optional = true }
 bincode = { version = "1.2.1", optional = true }
-gimli = { version = "0.23.0", default-features = false, features = ["write"], optional = true }
+gimli = { version = "0.24.0", default-features = false, features = ["write"], optional = true }
 smallvec = { version = "1.6.1" }
-thiserror = "1.0.4"
-byteorder = { version = "1.3.2", default-features = false }
-peepmatic = { path = "../peepmatic", optional = true, version = "0.72.0" }
-peepmatic-traits = { path = "../peepmatic/crates/traits", optional = true, version = "0.72.0" }
-peepmatic-runtime = { path = "../peepmatic/crates/runtime", optional = true, version = "0.72.0" }
+peepmatic = { path = "../peepmatic", optional = true, version = "0.73.0" }
+peepmatic-traits = { path = "../peepmatic/crates/traits", optional = true, version = "0.73.0" }
+peepmatic-runtime = { path = "../peepmatic/crates/runtime", optional = true, version = "0.73.0" }
 regalloc = { version = "0.0.31" }
 souper-ir = { version = "2.1.0", optional = true }
 wast = { version = "35.0.0", optional = true }
@@ -36,8 +34,11 @@ wast = { version = "35.0.0", optional = true }
 # machine code. Integration tests that need external dependencies can be
 # accomodated in `tests`.

+[dev-dependencies]
+criterion = "0.3"
+
 [build-dependencies]
-cranelift-codegen-meta = { path = "meta", version = "0.72.0" }
+cranelift-codegen-meta = { path = "meta", version = "0.73.0" }

 [features]
 default = ["std", "unwind"]
@@ -63,14 +64,22 @@ unwind = ["gimli"]
 x86 = []
 arm64 = []
 riscv = []
-x64 = [] # New work-in-progress codegen backend for x86_64 based on the new isel.
+s390x = []
 arm32 = [] # Work-in-progress codegen backend for ARM.

+# Stub feature that does nothing, for Cargo-features compatibility: the new
+# backend is the default now.
+experimental_x64 = []
+
+# Make the old x86 backend the default.
+old-x86-backend = []
+
 # Option to enable all architectures.
 all-arch = [
    "x86",
    "arm64",
-    "riscv"
+    "riscv",
+    "s390x"
 ]

 # For dependent crates that want to serialize some parts of cranelift
@@ -97,3 +106,7 @@ souper-harvest = ["souper-ir", "souper-ir/stringify"]

 [badges]
 maintenance = { status = "experimental" }
+
+[[bench]]
+name = "x64-evex-encoding"
+harness = false
--- a/cranelift/codegen/benches/x64-evex-encoding.rs
+++ b/cranelift/codegen/benches/x64-evex-encoding.rs
@@ -0,0 +1,138 @@
+//! Measure instruction encoding latency using various approaches; the
+//! benchmarking is feature-gated on `x86` since it only measures the encoding
+//! mechanism of that backend.
+
+#[cfg(feature = "x86")]
+mod x86 {
+    use cranelift_codegen::isa::x64::encoding::{
+        evex::{EvexContext, EvexInstruction, EvexMasking, EvexVectorLength, Register},
+        rex::OpcodeMap,
+        rex::{encode_modrm, LegacyPrefixes},
+        ByteSink,
+    };
+    use cranelift_codegen_shared::isa::x86::EncodingBits;
+    use criterion::{criterion_group, Criterion};
+
+    // Define the benchmarks: two approaches to EVEX encoding, timed in one Criterion group.
+    fn x64_evex_encoding_benchmarks(c: &mut Criterion) {
+        let mut group = c.benchmark_group("x64 EVEX encoding");
+        let rax = Register::from(0);
+        let rdx = Register::from(2);
+
+        group.bench_function("EvexInstruction (builder pattern)", |b| {
+            let mut sink = vec![];
+            b.iter(|| {
+                sink.clear(); // reuse one buffer so it does not grow across iterations
+                EvexInstruction::new()
+                    .prefix(LegacyPrefixes::_66)
+                    .map(OpcodeMap::_0F38)
+                    .w(true)
+                    .opcode(0x1F)
+                    .reg(rax)
+                    .rm(rdx)
+                    .length(EvexVectorLength::V128)
+                    .encode(&mut sink);
+            });
+        });
+
+        group.bench_function("encode_evex (function pattern)", |b| {
+            let mut sink = vec![];
+            // NOTE(review): bytes mirror the builder's 0x66 / 0F38 / 0x1F; last two args presumably rrr and W -- confirm
+            let bits = EncodingBits::new(&[0x66, 0x0f, 0x38, 0x1f], 0, 1);
+            let vvvvv = Register::from(0); // register number 0, the same number `rax` is built from
+            b.iter(|| {
+                sink.clear(); // reuse one buffer so it does not grow across iterations
+                encode_evex(
+                    bits,
+                    rax,
+                    vvvvv,
+                    rdx,
+                    EvexContext::Other {
+                        length: EvexVectorLength::V128,
+                    },
+                    EvexMasking::default(),
+                    &mut sink,
+                );
+            })
+        });
+    }
+    criterion_group!(benches, x64_evex_encoding_benchmarks);
+
+    /// Runs the `benches` group by hand: the group lives in this feature-gated
+    /// inner module, so `main` calls this instead of using `criterion_main!`.
+    pub fn run_benchmarks() {
+        criterion::__warn_about_html_reports_feature();
+        criterion::__warn_about_cargo_bench_support_feature();
+        benches(); // the group function declared via `criterion_group!` above
+        Criterion::default().configure_from_args().final_summary();
+    }
+
+    /// From the legacy x86 backend: a mechanism for encoding an EVEX
+    /// instruction, including the prefixes, the instruction opcode, and the
+    /// ModRM byte. This EVEX encoding function only encodes the `reg` (operand
+    /// 1), `vvvv` (operand 2), `rm` (operand 3) form; other forms are possible
+    /// (see section 2.6.2, Intel Software Development Manual, volume 2A),
+    /// requiring refactoring of this function or separate functions for each
+    /// form (e.g. as for the REX prefix).
+    #[inline(always)]
+    pub fn encode_evex<CS: ByteSink + ?Sized>(
+        enc: EncodingBits,
+        reg: Register,
+        vvvvv: Register,
+        rm: Register,
+        context: EvexContext,
+        masking: EvexMasking,
+        sink: &mut CS,
+    ) {
+        let reg: u8 = reg.into();
+        let rm: u8 = rm.into();
+        let vvvvv: u8 = vvvvv.into();
+
+        // EVEX prefix: the 0x62 byte, followed by the three bytes p0, p1 and p2.
+        sink.put1(0x62);
+
+        debug_assert!(enc.mm() < 0b100);
+        let mut p0 = enc.mm() & 0b11; // bits 1:0: the `mm` field
+        p0 |= evex2(rm, reg) << 4; // bits 7:4 come from `evex2`; bits 3:2 are always unset
+        sink.put1(p0);
+
+        let mut p1 = enc.pp() | 0b100; // bit 2 is always set
+        p1 |= (!(vvvvv) & 0b1111) << 3; // bits 6:3: low four bits of `vvvvv`, inverted
+        p1 |= (enc.rex_w() & 0b1) << 7; // bit 7: the W bit
+        sink.put1(p1);
+
+        let mut p2 = masking.aaa_bits(); // starts from the masking `aaa` bits
+        p2 |= (!(vvvvv >> 4) & 0b1) << 3; // bit 3: bit 4 of `vvvvv`, inverted
+        p2 |= context.bits() << 4; // context bits, placed from bit 4 upward
+        p2 |= masking.z_bit() << 7; // bit 7: the masking `z` bit
+        sink.put1(p2);
+
+        // Opcode.
+        sink.put1(enc.opcode_byte());
+
+        // ModR/M byte: only the low three bits of `reg` and `rm`; bits 3 and 4 went into p0.
+        sink.put1(encode_modrm(3, reg & 7, rm & 7))
+    }
+
+    /// From the legacy x86 backend: encode the RXBR' bits of the EVEX P0 byte.
+    /// For an explanation of these bits, see section 2.6.1 in the Intel
+    /// Software Development Manual, volume 2A. These bits can be used by
+    /// different addressing modes (see section 2.6.2), requiring different
+    /// `evex*` functions than this one.
+    fn evex2(rm: u8, reg: u8) -> u8 {
+        let b = !(rm >> 3) & 1; // inverted bit 3 of `rm`
+        let x = !(rm >> 4) & 1; // inverted bit 4 of `rm`
+        let r = !(reg >> 3) & 1; // inverted bit 3 of `reg`
+        let r_ = !(reg >> 4) & 1; // inverted bit 4 of `reg`
+        0x00 | r_ | (b << 1) | (x << 2) | (r << 3) // result bits 3..0 are r, x, b, r_
+    }
+}
+
+fn main() {
+    #[cfg(feature = "x86")] // the `x86` module above only exists with this feature
+    x86::run_benchmarks();
+
+    #[cfg(not(feature = "x86"))] // otherwise just explain why nothing ran
+    println!(
+        "Unable to run the x64-evex-encoding benchmark; the `x86` feature must be enabled in Cargo.",
+    );
+}
--- a/cranelift/codegen/meta/Cargo.toml
+++ b/cranelift/codegen/meta/Cargo.toml
@@ -1,19 +1,20 @@
 [package]
 name = "cranelift-codegen-meta"
 authors = ["The Cranelift Project Developers"]
-version = "0.72.0"
+version = "0.73.0"
 description = "Metaprogram for cranelift-codegen code generator library"
 license = "Apache-2.0 WITH LLVM-exception"
 repository = "https://github.com/bytecodealliance/wasmtime"
 readme = "README.md"
 edition = "2018"

-[package.metadata.docs.rs]
-rustdoc-args = [ "--document-private-items" ]
+# FIXME(rust-lang/cargo#9300): uncomment once that lands
+# [package.metadata.docs.rs]
+# rustdoc-args = [ "--document-private-items" ]

 [dependencies]
-cranelift-codegen-shared = { path = "../shared", version = "0.72.0" }
-cranelift-entity = { path = "../../entity", version = "0.72.0" }
+cranelift-codegen-shared = { path = "../shared", version = "0.73.0" }
+cranelift-entity = { path = "../../entity", version = "0.73.0" }

 [badges]
 maintenance = { status = "experimental" }
--- a/cranelift/codegen/meta/src/cdsl/settings.rs
+++ b/cranelift/codegen/meta/src/cdsl/settings.rs
@@ -20,6 +20,7 @@ pub(crate) enum SpecificSetting {
 #[derive(Hash, PartialEq, Eq)]
 pub(crate) struct Setting {
    pub name: &'static str,
+    pub description: &'static str,
    pub comment: &'static str,
    pub specific: SpecificSetting,
    pub byte_offset: u8,
@@ -88,6 +89,7 @@ impl Into<PresetType> for PresetIndex {
 #[derive(Hash, PartialEq, Eq)]
 pub(crate) struct Preset {
    pub name: &'static str,
+    pub description: &'static str,
    values: Vec<BoolSettingIndex>,
 }

@@ -169,6 +171,7 @@ pub(crate) enum ProtoSpecificSetting {
 /// This is the information provided during building for a setting.
 struct ProtoSetting {
    name: &'static str,
+    description: &'static str,
    comment: &'static str,
    specific: ProtoSpecificSetting,
 }
@@ -251,11 +254,13 @@ impl SettingGroupBuilder {
    fn add_setting(
        &mut self,
        name: &'static str,
+        description: &'static str,
        comment: &'static str,
        specific: ProtoSpecificSetting,
    ) {
        self.settings.push(ProtoSetting {
            name,
+            description,
            comment,
            specific,
        })
@@ -264,6 +269,7 @@ impl SettingGroupBuilder {
    pub fn add_bool(
        &mut self,
        name: &'static str,
+        description: &'static str,
        comment: &'static str,
        default: bool,
    ) -> BoolSettingIndex {
@@ -271,28 +277,55 @@ impl SettingGroupBuilder {
            self.predicates.is_empty(),
            "predicates must be added after the boolean settings"
        );
-        self.add_setting(name, comment, ProtoSpecificSetting::Bool(default));
+        self.add_setting(
+            name,
+            description,
+            comment,
+            ProtoSpecificSetting::Bool(default),
+        );
        BoolSettingIndex(self.settings.len() - 1)
    }

    pub fn add_enum(
        &mut self,
        name: &'static str,
+        description: &'static str,
        comment: &'static str,
        values: Vec<&'static str>,
    ) {
-        self.add_setting(name, comment, ProtoSpecificSetting::Enum(values));
+        self.add_setting(
+            name,
+            description,
+            comment,
+            ProtoSpecificSetting::Enum(values),
+        );
    }

-    pub fn add_num(&mut self, name: &'static str, comment: &'static str, default: u8) {
-        self.add_setting(name, comment, ProtoSpecificSetting::Num(default));
+    pub fn add_num(
+        &mut self,
+        name: &'static str,
+        description: &'static str,
+        comment: &'static str,
+        default: u8,
+    ) {
+        self.add_setting(
+            name,
+            description,
+            comment,
+            ProtoSpecificSetting::Num(default),
+        );
    }

    pub fn add_predicate(&mut self, name: &'static str, node: PredicateNode) {
        self.predicates.push(ProtoPredicate { name, node });
    }

-    pub fn add_preset(&mut self, name: &'static str, args: Vec<PresetType>) -> PresetIndex {
+    pub fn add_preset(
+        &mut self,
+        name: &'static str,
+        description: &'static str,
+        args: Vec<PresetType>,
+    ) -> PresetIndex {
        let mut values = Vec::new();
        for arg in args {
            match arg {
@@ -302,7 +335,11 @@ impl SettingGroupBuilder {
                PresetType::BoolSetting(index) => values.push(index),
            }
        }
-        self.presets.push(Preset { name, values });
+        self.presets.push(Preset {
+            name,
+            description,
+            values,
+        });
        PresetIndex(self.presets.len() - 1)
    }

@@ -347,6 +384,7 @@ impl SettingGroupBuilder {

            group.settings.push(Setting {
                name: s.name,
+                description: s.description,
                comment: s.comment,
                byte_offset,
                specific,
@@ -367,6 +405,7 @@ impl SettingGroupBuilder {
            };
            group.settings.push(Setting {
                name: s.name,
+                description: s.description,
                comment: s.comment,
                byte_offset: byte_offset + predicate_number / 8,
                specific: SpecificSetting::Bool(BoolSetting {
--- a/cranelift/codegen/meta/src/gen_settings.rs
+++ b/cranelift/codegen/meta/src/gen_settings.rs
@@ -70,6 +70,33 @@ fn gen_constructor(group: &SettingGroup, parent: ParentGroup, fmt: &mut Formatte
    fmtln!(fmt, "}");
 }

+/// Emits an `impl Flags` block whose `iter` method yields one `Value` per non-preset descriptor.
+fn gen_iterator(group: &SettingGroup, fmt: &mut Formatter) {
+    fmtln!(fmt, "impl Flags {");
+    fmt.indent(|fmt| {
+        fmt.doc_comment("Iterates the setting values.");
+        fmtln!(fmt, "pub fn iter(&self) -> impl Iterator<Item = Value> {");
+        fmt.indent(|fmt| {
+            // The generated code copies the first `settings_size` bytes so the `move` closure owns them.
+            fmtln!(fmt, "let mut bytes = [0; {}];", group.settings_size);
+            fmtln!(fmt, "bytes.copy_from_slice(&self.bytes[0..{}]);", group.settings_size);
+            fmtln!(fmt, "DESCRIPTORS.iter().filter_map(move |d| {");
+            fmt.indent(|fmt| {
+                fmtln!(fmt, "let values = match &d.detail {"); // `values` is only `Some` for enum settings
+                fmt.indent(|fmt| {
+                    fmtln!(fmt, "detail::Detail::Preset => return None,"); // presets are skipped entirely
+                    fmtln!(fmt, "detail::Detail::Enum { last, enumerators } => Some(TEMPLATE.enums(*last, *enumerators)),");
+                    fmtln!(fmt, "_ => None"); // every other detail kind carries no `values`
+                });
+                fmtln!(fmt, "};");
+                fmtln!(fmt, "Some(Value{ name: d.name, detail: d.detail, values, value: bytes[d.offset as usize] })");
+            });
+            fmtln!(fmt, "})");
+        });
+        fmtln!(fmt, "}");
+    });
+    fmtln!(fmt, "}");
+}
+
 /// Emit Display and FromStr implementations for enum settings.
 fn gen_to_and_from_str(name: &str, values: &[&'static str], fmt: &mut Formatter) {
    fmtln!(fmt, "impl fmt::Display for {} {{", name);
@@ -136,7 +163,7 @@ fn gen_enum_types(group: &SettingGroup, fmt: &mut Formatter) {

 /// Emit a getter function for `setting`.
 fn gen_getter(setting: &Setting, fmt: &mut Formatter) {
-    fmt.doc_comment(setting.comment);
+    fmt.doc_comment(format!("{}\n{}", setting.description, setting.comment));
    match setting.specific {
        SpecificSetting::Bool(BoolSetting {
            predicate_number, ..
@@ -254,6 +281,7 @@ fn gen_descriptors(group: &SettingGroup, fmt: &mut Formatter) {
            fmtln!(fmt, "detail::Descriptor {");
            fmt.indent(|fmt| {
                fmtln!(fmt, "name: \"{}\",", setting.name);
+                fmtln!(fmt, "description: \"{}\",", setting.description);
                fmtln!(fmt, "offset: {},", setting.byte_offset);
                match setting.specific {
                    SpecificSetting::Bool(BoolSetting { bit_offset, .. }) => {
@@ -286,6 +314,7 @@ fn gen_descriptors(group: &SettingGroup, fmt: &mut Formatter) {
            fmtln!(fmt, "detail::Descriptor {");
            fmt.indent(|fmt| {
                fmtln!(fmt, "name: \"{}\",", preset.name);
+                fmtln!(fmt, "description: \"{}\",", preset.description);
                fmtln!(fmt, "offset: {},", (idx as u8) * group.settings_size);
                fmtln!(fmt, "detail: detail::Detail::Preset,");
            });
@@ -427,6 +456,7 @@ fn gen_group(group: &SettingGroup, parent: ParentGroup, fmt: &mut Formatter) {
    fmtln!(fmt, "}");

    gen_constructor(group, parent, fmt);
+    gen_iterator(group, fmt);
    gen_enum_types(group, fmt);
    gen_getters(group, fmt);
    gen_descriptors(group, fmt);
--- a/cranelift/codegen/meta/src/isa/arm64/mod.rs
+++ b/cranelift/codegen/meta/src/isa/arm64/mod.rs
@@ -9,7 +9,7 @@ use crate::shared::Definitions as SharedDefinitions;

 fn define_settings(_shared: &SettingGroup) -> SettingGroup {
    let mut setting = SettingGroupBuilder::new("arm64");
-    let has_lse = setting.add_bool("has_lse", "Large System Extensions", false);
+    let has_lse = setting.add_bool("has_lse", "Has Large System Extensions support.", "", false);

    setting.add_predicate("use_lse", predicate!(has_lse));
    setting.build()
--- a/cranelift/codegen/meta/src/isa/mod.rs
+++ b/cranelift/codegen/meta/src/isa/mod.rs
@@ -6,6 +6,7 @@ use std::fmt;
 mod arm32;
 mod arm64;
 mod riscv;
+mod s390x;
 pub(crate) mod x86;

 /// Represents known ISA target.
@@ -15,6 +16,7 @@ pub enum Isa {
    X86,
    Arm32,
    Arm64,
+    S390x,
 }

 impl Isa {
@@ -31,6 +33,7 @@ impl Isa {
        match arch {
            "riscv" => Some(Isa::Riscv),
            "aarch64" => Some(Isa::Arm64),
+            "s390x" => Some(Isa::S390x),
            x if ["x86_64", "i386", "i586", "i686"].contains(&x) => Some(Isa::X86),
            x if x.starts_with("arm") || arch.starts_with("thumb") => Some(Isa::Arm32),
            _ => None,
@@ -39,7 +42,7 @@ impl Isa {

    /// Returns all supported isa targets.
    pub fn all() -> &'static [Isa] {
-        &[Isa::Riscv, Isa::X86, Isa::Arm32, Isa::Arm64]
+        &[Isa::Riscv, Isa::X86, Isa::Arm32, Isa::Arm64, Isa::S390x]
    }
 }

@@ -51,6 +54,7 @@ impl fmt::Display for Isa {
            Isa::X86 => write!(f, "x86"),
            Isa::Arm32 => write!(f, "arm32"),
            Isa::Arm64 => write!(f, "arm64"),
+            Isa::S390x => write!(f, "s390x"),
        }
    }
 }
@@ -62,6 +66,7 @@ pub(crate) fn define(isas: &[Isa], shared_defs: &mut SharedDefinitions) -> Vec<T
            Isa::X86 => x86::define(shared_defs),
            Isa::Arm32 => arm32::define(shared_defs),
            Isa::Arm64 => arm64::define(shared_defs),
+            Isa::S390x => s390x::define(shared_defs),
        })
        .collect()
 }
--- a/cranelift/codegen/meta/src/isa/riscv/mod.rs
+++ b/cranelift/codegen/meta/src/isa/riscv/mod.rs
@@ -17,33 +17,39 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
    let supports_m = setting.add_bool(
        "supports_m",
        "CPU supports the 'M' extension (mul/div)",
+        "",
        false,
    );
    let supports_a = setting.add_bool(
        "supports_a",
        "CPU supports the 'A' extension (atomics)",
+        "",
        false,
    );
    let supports_f = setting.add_bool(
        "supports_f",
        "CPU supports the 'F' extension (float)",
+        "",
        false,
    );
    let supports_d = setting.add_bool(
        "supports_d",
        "CPU supports the 'D' extension (double)",
+        "",
        false,
    );

    let enable_m = setting.add_bool(
        "enable_m",
        "Enable the use of 'M' instructions if available",
+        "",
        true,
    );

    setting.add_bool(
        "enable_e",
        "Enable the 'RV32E' instruction set with only 16 registers",
+        "",
        false,
    );

--- a/cranelift/codegen/meta/src/isa/s390x/mod.rs
+++ b/cranelift/codegen/meta/src/isa/s390x/mod.rs
@@ -0,0 +1,31 @@
+use crate::cdsl::cpu_modes::CpuMode;
+use crate::cdsl::instructions::{InstructionGroupBuilder, InstructionPredicateMap};
+use crate::cdsl::isa::TargetIsa;
+use crate::cdsl::recipes::Recipes;
+use crate::cdsl::regs::IsaRegsBuilder;
+use crate::cdsl::settings::SettingGroupBuilder;
+
+use crate::shared::Definitions as SharedDefinitions;
+
+pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { // assembles the "s390x" TargetIsa
+    let inst_group = InstructionGroupBuilder::new(&mut shared_defs.all_instructions).build();
+    let settings = SettingGroupBuilder::new("s390x").build(); // built without adding any settings
+    let regs = IsaRegsBuilder::new().build(); // built without adding any registers
+    let recipes = Recipes::new();
+    let encodings_predicates = InstructionPredicateMap::new();
+    // NOTE(review): nothing is added to any component above -- presumably s390x lowering lives elsewhere; confirm.
+    let mut mode = CpuMode::new("s390x");
+    let expand = shared_defs.transform_groups.by_name("expand");
+    mode.legalize_default(expand); // the shared "expand" group is the default legalization
+    let cpu_modes = vec![mode];
+
+    TargetIsa::new(
+        "s390x",
+        inst_group,
+        settings,
+        regs,
+        recipes,
+        cpu_modes,
+        encodings_predicates,
+    )
+}
--- a/cranelift/codegen/meta/src/isa/x86/settings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/settings.rs
@@ -4,37 +4,77 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
    let mut settings = SettingGroupBuilder::new("x86");

    // CPUID.01H:ECX
-    let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
-    let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);
-    let has_sse41 = settings.add_bool("has_sse41", "SSE4.1: CPUID.01H:ECX.SSE4_1[bit 19]", false);
-    let has_sse42 = settings.add_bool("has_sse42", "SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]", false);
-    let has_avx = settings.add_bool("has_avx", "AVX: CPUID.01H:ECX.AVX[bit 28]", false);
-    let has_avx2 = settings.add_bool("has_avx2", "AVX2: CPUID.07H:EBX.AVX2[bit 5]", false);
+    let has_sse3 = settings.add_bool(
+        "has_sse3",
+        "Has support for SSE3.",
+        "SSE3: CPUID.01H:ECX.SSE3[bit 0]",
+        false,
+    );
+    let has_ssse3 = settings.add_bool(
+        "has_ssse3",
+        "Has support for SSSE3.",
+        "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]",
+        false,
+    );
+    let has_sse41 = settings.add_bool(
+        "has_sse41",
+        "Has support for SSE4.1.",
+        "SSE4.1: CPUID.01H:ECX.SSE4_1[bit 19]",
+        false,
+    );
+    let has_sse42 = settings.add_bool(
+        "has_sse42",
+        "Has support for SSE4.2.",
+        "SSE4.2: CPUID.01H:ECX.SSE4_2[bit 20]",
+        false,
+    );
+    let has_avx = settings.add_bool(
+        "has_avx",
+        "Has support for AVX.",
+        "AVX: CPUID.01H:ECX.AVX[bit 28]",
+        false,
+    );
+    let has_avx2 = settings.add_bool(
+        "has_avx2",
+        "Has support for AVX2.",
+        "AVX2: CPUID.07H:EBX.AVX2[bit 5]",
+        false,
+    );
    let has_avx512dq = settings.add_bool(
        "has_avx512dq",
+        "Has support for AVX512DQ.",
        "AVX512DQ: CPUID.07H:EBX.AVX512DQ[bit 17]",
        false,
    );
    let has_avx512vl = settings.add_bool(
        "has_avx512vl",
+        "Has support for AVX512VL.",
        "AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
        false,
    );
    let has_avx512f = settings.add_bool(
        "has_avx512f",
+        "Has support for AVX512F.",
        "AVX512F: CPUID.07H:EBX.AVX512F[bit 16]",
        false,
    );
-    let has_popcnt = settings.add_bool("has_popcnt", "POPCNT: CPUID.01H:ECX.POPCNT[bit 23]", false);
+    let has_popcnt = settings.add_bool(
+        "has_popcnt",
+        "Has support for POPCNT.",
+        "POPCNT: CPUID.01H:ECX.POPCNT[bit 23]",
+        false,
+    );

    // CPUID.(EAX=07H, ECX=0H):EBX
    let has_bmi1 = settings.add_bool(
        "has_bmi1",
+        "Has support for BMI1.",
        "BMI1: CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]",
        false,
    );
    let has_bmi2 = settings.add_bool(
        "has_bmi2",
+        "Has support for BMI2.",
        "BMI2: CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]",
        false,
    );
@@ -42,6 +82,7 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
    // CPUID.EAX=80000001H:ECX
    let has_lzcnt = settings.add_bool(
        "has_lzcnt",
+        "Has support for LZCNT.",
        "LZCNT: CPUID.EAX=80000001H:ECX.LZCNT[bit 5]",
        false,
    );
@@ -85,7 +126,7 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
    settings.add_predicate("use_lzcnt", predicate!(has_lzcnt));

    // Some shared boolean values are used in x86 instruction predicates, so we need to group them
-    // in the same TargetIsa, for compabitibity with code generated by meta-python.
+    // in the same TargetIsa, for compatibility with code generated by meta-python.
    // TODO Once all the meta generation code has been migrated from Python to Rust, we can put it
    // back in the shared SettingGroup, and use it in x86 instruction predicates.

@@ -104,21 +145,40 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {

    // Presets corresponding to x86 CPUs.

-    settings.add_preset("baseline", preset!());
+    settings.add_preset(
+        "baseline",
+        "A baseline preset with no extensions enabled.",
+        preset!(),
+    );
    let nehalem = settings.add_preset(
        "nehalem",
+        "Nehalem microarchitecture.",
        preset!(has_sse3 && has_ssse3 && has_sse41 && has_sse42 && has_popcnt),
    );
    let haswell = settings.add_preset(
        "haswell",
+        "Haswell microarchitecture.",
        preset!(nehalem && has_bmi1 && has_bmi2 && has_lzcnt),
    );
-    let broadwell = settings.add_preset("broadwell", preset!(haswell));
-    let skylake = settings.add_preset("skylake", preset!(broadwell));
-    let cannonlake = settings.add_preset("cannonlake", preset!(skylake));
-    settings.add_preset("icelake", preset!(cannonlake));
+    let broadwell = settings.add_preset(
+        "broadwell",
+        "Broadwell microarchitecture.",
+        preset!(haswell),
+    );
+    let skylake = settings.add_preset("skylake", "Skylake microarchitecture.", preset!(broadwell));
+    let cannonlake = settings.add_preset(
+        "cannonlake",
+        "Cannon Lake microarchitecture.",
+        preset!(skylake),
+    );
+    settings.add_preset(
+        "icelake",
+        "Ice Lake microarchitecture.",
+        preset!(cannonlake),
+    );
    settings.add_preset(
        "znver1",
+        "Zen (first generation) microarchitecture.",
        preset!(
            has_sse3
                && has_ssse3
--- a/cranelift/codegen/meta/src/lib.rs
+++ b/cranelift/codegen/meta/src/lib.rs
@@ -116,6 +116,9 @@ pub fn generate(
            isa::Isa::Arm64 => {
                // aarch64 doesn't have platform-specific settings.
            }
+            isa::Isa::S390x => {
+                // s390x doesn't have platform-specific settings.
+            }
            isa::Isa::Arm32 | isa::Isa::Riscv => todo!(),
        }
    }
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3582,7 +3582,7 @@ pub(crate) fn define(
            "fmin_pseudo",
            r#"
        Floating point pseudo-minimum, propagating NaNs.  This behaves differently from ``fmin``.
-        See https://github.com/WebAssembly/simd/pull/122 for background.
+        See <https://github.com/WebAssembly/simd/pull/122> for background.

        The behaviour is defined as ``fmin_pseudo(a, b) = (b < a) ? b : a``, and the behaviour
        for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
@@ -3614,7 +3614,7 @@ pub(crate) fn define(
            "fmax_pseudo",
            r#"
        Floating point pseudo-maximum, propagating NaNs.  This behaves differently from ``fmax``.
-        See https://github.com/WebAssembly/simd/pull/122 for background.
+        See <https://github.com/WebAssembly/simd/pull/122> for background.

        The behaviour is defined as ``fmax_pseudo(a, b) = (a < b) ? b : a``, and the behaviour
        for zero or NaN inputs follows from the behaviour of ``<`` with such inputs.
@@ -4102,7 +4102,7 @@ pub(crate) fn define(
        This will double the lane width and halve the number of lanes.  So the resulting
        vector has the same number of bits as `x` and `y` do (individually).

-        See https://github.com/WebAssembly/simd/pull/127 for background info.
+        See <https://github.com/WebAssembly/simd/pull/127> for background info.
            "#,
            &formats.binary,
        )
@@ -4325,6 +4325,26 @@ pub(crate) fn define(
        .operands_out(vec![a]),
    );

+    ig.push(
+        Inst::new(
+            "fcvt_low_from_sint",
+            r#"
+        Converts packed signed doubleword integers to packed double precision floating point.
+
+        Considering only the low half of the register, each lane in `x` is interpreted as a
+        signed doubleword integer that is then converted to a double precision float. This
+        instruction differs from fcvt_from_sint in that it converts half the number of lanes
+        which are converted to occupy twice the number of bits. No rounding should be needed
+        for the resulting float.
+
+        The result type will have half the number of vector lanes as the input.
+        "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
    let WideInt = &TypeVar::new(
        "WideInt",
        "An integer type with lanes from `i16` upwards",
--- a/cranelift/codegen/meta/src/shared/settings.rs
+++ b/cranelift/codegen/meta/src/shared/settings.rs
@@ -5,8 +5,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_enum(
        "regalloc",
-        r#"Register allocator to use with the MachInst backend.
-
+        "Register allocator to use with the MachInst backend.",
+        r#"
            This selects the register allocator as an option among those offered by the `regalloc.rs`
            crate. Please report register allocation bugs to the maintainers of this crate whenever
            possible.
@@ -38,22 +38,21 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_enum(
        "opt_level",
+        "Optimization level for generated code.",
        r#"
-        Optimization level:
+            Supported levels:

-        - none: Minimise compile time by disabling most optimizations.
-        - speed: Generate the fastest possible code
-        - speed_and_size: like "speed", but also perform transformations
-          aimed at reducing code size.
+            - `none`: Minimise compile time by disabling most optimizations.
+            - `speed`: Generate the fastest possible code
+            - `speed_and_size`: like "speed", but also perform transformations aimed at reducing code size.
        "#,
        vec!["none", "speed", "speed_and_size"],
    );

    settings.add_bool(
        "enable_verifier",
+        "Run the Cranelift IR verifier at strategic times during compilation.",
        r#"
-        Run the Cranelift IR verifier at strategic times during compilation.
-
            This makes compilation slower but catches many bugs. The verifier is always enabled by
            default, which is useful during development.
        "#,
@@ -65,15 +64,15 @@ pub(crate) fn define() -> SettingGroup {
    // `colocated` flag on external functions and global values.
    settings.add_bool(
        "is_pic",
-        "Enable Position-Independent Code generation",
+        "Enable Position-Independent Code generation.",
+        "",
        false,
    );

    settings.add_bool(
        "use_colocated_libcalls",
+        "Use colocated libcalls.",
        r#"
-            Use colocated libcalls.
-
            Generate code that assumes that libcalls can be declared "colocated",
            meaning they will be defined along with the current function, such that
            they can use more efficient addressing.
@@ -83,10 +82,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "avoid_div_traps",
+        "Generate explicit checks around native division instructions to avoid their trapping.",
        r#"
-            Generate explicit checks around native division instructions to avoid
-            their trapping.
-
            This is primarily used by SpiderMonkey which doesn't install a signal
            handler for SIGFPE, but expects a SIGILL trap for division by zero.

@@ -98,9 +95,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "enable_float",
+        "Enable the use of floating-point instructions.",
        r#"
-            Enable the use of floating-point instructions
-
            Disabling use of floating-point instructions is not yet implemented.
        "#,
        true,
@@ -108,9 +104,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "enable_nan_canonicalization",
+        "Enable NaN canonicalization.",
        r#"
-            Enable NaN canonicalization
-
            This replaces NaNs with a single canonical value, for users requiring
            entirely deterministic WebAssembly computation. This is not required
            by the WebAssembly spec, so it is not enabled by default.
@@ -120,8 +115,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "enable_pinned_reg",
-        r#"Enable the use of the pinned register.
-
+        "Enable the use of the pinned register.",
+        r#"
            This register is excluded from register allocation, and is completely under the control of
            the end-user. It is possible to read it via the get_pinned_reg instruction, and to set it
            with the set_pinned_reg instruction.
@@ -131,8 +126,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "use_pinned_reg_as_heap_base",
-        r#"Use the pinned register as the heap base.
-
+        "Use the pinned register as the heap base.",
+        r#"
            Enabling this requires the enable_pinned_reg setting to be set to true. It enables a custom
            legalization of the `heap_addr` instruction so it will use the pinned register as the heap
            base, instead of fetching it from a global value.
@@ -144,19 +139,24 @@ pub(crate) fn define() -> SettingGroup {
        false,
    );

-    settings.add_bool("enable_simd", "Enable the use of SIMD instructions.", false);
+    settings.add_bool(
+        "enable_simd",
+        "Enable the use of SIMD instructions.",
+        "",
+        false,
+    );

    settings.add_bool(
        "enable_atomics",
        "Enable the use of atomic instructions",
+        "",
        true,
    );

    settings.add_bool(
        "enable_safepoints",
+        "Enable safepoint instruction insertions.",
        r#"
-            Enable safepoint instruction insertions.
-
            This will allow the emit_stack_maps() function to insert the safepoint
            instruction on top of calls and interrupt traps in order to display the
            live reference values at that point in the program.
@@ -166,9 +166,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_enum(
        "tls_model",
-        r#"
-            Defines the model used to perform TLS accesses.
-        "#,
+        "Defines the model used to perform TLS accesses.",
+        "",
        vec!["none", "elf_gd", "macho", "coff"],
    );

@@ -176,9 +175,9 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_enum(
        "libcall_call_conv",
+        "Defines the calling convention to use for LibCalls call expansion.",
        r#"
-            Defines the calling convention to use for LibCalls call expansion,
-            since it may be different from the ISA default calling convention.
+            This may be different from the ISA default calling convention.

            The default value is to use the same calling convention as the ISA
            default calling convention.
@@ -192,6 +191,7 @@ pub(crate) fn define() -> SettingGroup {
            "cold",
            "system_v",
            "windows_fastcall",
+            "apple_aarch64",
            "baldrdash_system_v",
            "baldrdash_windows",
            "baldrdash_2020",
@@ -201,9 +201,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_num(
        "baldrdash_prologue_words",
+        "Number of pointer-sized words pushed by the baldrdash prologue.",
        r#"
-            Number of pointer-sized words pushed by the baldrdash prologue.
-
            Functions with the `baldrdash` calling convention don't generate their
            own prologue and epilogue. They depend on externally generated code
            that pushes a fixed number of words in the prologue and restores them
@@ -218,9 +217,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "enable_llvm_abi_extensions",
+        "Enable various ABI extensions defined by LLVM's behavior.",
        r#"
-            Enable various ABI extensions defined by LLVM's behavior.
-
            In some cases, LLVM's implementation of an ABI (calling convention)
            goes beyond a standard and supports additional argument types or
            behavior. This option instructs Cranelift codegen to follow LLVM's
@@ -237,12 +235,12 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "unwind_info",
+        "Generate unwind information.",
        r#"
-           Generate unwind info. This increases metadata size and compile time,
-           but allows for the debugger to trace frames, is needed for GC tracing
-           that relies on libunwind (such as in Wasmtime), and is
-           unconditionally needed on certain platforms (such as Windows) that
-           must always be able to unwind.
+            This increases metadata size and compile time, but allows for the
+            debugger to trace frames, is needed for GC tracing that relies on
+            libunwind (such as in Wasmtime), and is unconditionally needed on
+            certain platforms (such as Windows) that must always be able to unwind.
          "#,
        true,
    );
@@ -252,6 +250,7 @@ pub(crate) fn define() -> SettingGroup {
    settings.add_bool(
        "emit_all_ones_funcaddrs",
        "Emit not-yet-relocated function addresses as all-ones bit patterns.",
+        "",
        false,
    );

@@ -259,27 +258,22 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "enable_probestack",
-        r#"
-            Enable the use of stack probes, for calling conventions which support this
-            functionality.
-            "#,
+        "Enable the use of stack probes for supported calling conventions.",
+        "",
        true,
    );

    settings.add_bool(
        "probestack_func_adjusts_sp",
-        r#"
-            Set this to true of the stack probe function modifies the stack pointer
-            itself.
-            "#,
+        "Enable if the stack probe adjusts the stack pointer.",
+        "",
        false,
    );

    settings.add_num(
        "probestack_size_log2",
+        "The log2 of the size of the stack guard region.",
        r#"
-            The log2 of the size of the stack guard region.
-
            Stack frames larger than this size will have stack overflow checked
            by calling the probestack function.

@@ -293,6 +287,7 @@ pub(crate) fn define() -> SettingGroup {
    settings.add_bool(
        "enable_jump_tables",
        "Enable the use of jump tables in generated machine code.",
+        "",
        true,
    );

@@ -300,9 +295,8 @@ pub(crate) fn define() -> SettingGroup {

    settings.add_bool(
        "enable_heap_access_spectre_mitigation",
+        "Enable Spectre mitigation on heap bounds checks.",
        r#"
-        Enable Spectre mitigation on heap bounds checks.
-
            This is a no-op for any heap that needs no bounds checks; e.g.,
            if the limit is static and the guard region is large enough that
            the index cannot reach past it.
--- a/cranelift/codegen/shared/Cargo.toml
+++ b/cranelift/codegen/shared/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 authors = ["The Cranelift Project Developers"]
 name = "cranelift-codegen-shared"
-version = "0.72.0"
+version = "0.73.0"
 description = "For code shared between cranelift-codegen-meta and cranelift-codegen"
 license = "Apache-2.0 WITH LLVM-exception"
 repository = "https://github.com/bytecodealliance/wasmtime"
--- a/cranelift/codegen/src/binemit/mod.rs
+++ b/cranelift/codegen/src/binemit/mod.rs
@@ -60,6 +60,8 @@ pub enum Reloc {
    Arm64Call,
    /// RISC-V call target
    RiscvCall,
+    /// s390x PC-relative 4-byte offset
+    S390xPCRel32Dbl,

    /// Elf x86_64 32 bit signed PC relative offset to two GOT entries for GD symbol.
    ElfX86_64TlsGd,
@@ -75,6 +77,7 @@ impl fmt::Display for Reloc {
        match *self {
            Self::Abs4 => write!(f, "Abs4"),
            Self::Abs8 => write!(f, "Abs8"),
+            Self::S390xPCRel32Dbl => write!(f, "PCRel32Dbl"),
            Self::X86PCRel4 => write!(f, "PCRel4"),
            Self::X86PCRelRodata4 => write!(f, "PCRelRodata4"),
            Self::X86CallPCRel4 => write!(f, "CallPCRel4"),
--- a/cranelift/codegen/src/context.rs
+++ b/cranelift/codegen/src/context.rs
@@ -267,13 +267,7 @@ impl Context {
        isa: &dyn TargetIsa,
    ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
        if let Some(backend) = isa.get_mach_backend() {
-            use crate::isa::CallConv;
-            use crate::machinst::UnwindInfoKind;
-            let unwind_info_kind = match self.func.signature.call_conv {
-                CallConv::Fast | CallConv::Cold | CallConv::SystemV => UnwindInfoKind::SystemV,
-                CallConv::WindowsFastcall => UnwindInfoKind::Windows,
-                _ => UnwindInfoKind::None,
-            };
+            let unwind_info_kind = isa.unwind_info_kind();
            let result = self.mach_compile_result.as_ref().unwrap();
            return backend.emit_unwind_info(result, unwind_info_kind);
        }
--- a/cranelift/codegen/src/data_value.rs
+++ b/cranelift/codegen/src/data_value.rs
@@ -5,7 +5,6 @@ use crate::ir::{types, ConstantData, Type};
 use core::convert::TryInto;
 use core::fmt::{self, Display, Formatter};
 use core::ptr;
-use thiserror::Error;

 /// Represent a data value. Where [Value] is an SSA reference, [DataValue] is the type + value
 /// that would be referred to by a [Value].
@@ -97,15 +96,38 @@ impl DataValue {
 }

 /// Record failures to cast [DataValue].
-#[derive(Error, Debug, PartialEq)]
+#[derive(Debug, PartialEq)]
 #[allow(missing_docs)]
 pub enum DataValueCastFailure {
-    #[error("unable to cast data value of type {0} to type {1}")]
    TryInto(Type, Type),
-    #[error("unable to cast i64({0}) to a data value of type {1}")]
    FromInteger(i64, Type),
 }

+// Error and Display are implemented manually instead of using thiserror to reduce the number
+// of dependencies used by Cranelift.
+impl std::error::Error for DataValueCastFailure {}
+
+impl Display for DataValueCastFailure {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        match self {
+            DataValueCastFailure::TryInto(from, to) => {
+                write!(
+                    f,
+                    "unable to cast data value of type {} to type {}",
+                    from, to
+                )
+            }
+            DataValueCastFailure::FromInteger(val, to) => {
+                write!(
+                    f,
+                    "unable to cast i64({}) to a data value of type {}",
+                    val, to
+                )
+            }
+        }
+    }
+}
+
 /// Helper for creating conversion implementations for [DataValue].
 macro_rules! build_conversion_impl {
    ( $rust_ty:ty, $data_value_ty:ident, $cranelift_ty:ident ) => {
--- a/cranelift/codegen/src/ir/entities.rs
+++ b/cranelift/codegen/src/ir/entities.rs
@@ -146,7 +146,7 @@ impl StackSlot {
 /// [`VmContext`](super::GlobalValueData::VMContext) using
 /// [`FuncEnvironment::make_global`](https://docs.rs/cranelift-wasm/*/cranelift_wasm/trait.FuncEnvironment.html#tymethod.make_global).
 /// - When compiling to native code, you can use it for objects in static memory with
-/// [`Module::declare_data_in_func`](https://docs.rs/cranelift-module/*/cranelift_module/struct.Module.html#method.declare_data_in_func).
+/// [`Module::declare_data_in_func`](https://docs.rs/cranelift-module/*/cranelift_module/trait.Module.html#method.declare_data_in_func).
 /// - For any compilation target, it can be registered with
 /// [`FunctionBuilder::create_global_value`](https://docs.rs/cranelift-frontend/*/cranelift_frontend/struct.FunctionBuilder.html#method.create_global_value).
 ///
@@ -264,9 +264,9 @@ impl JumpTable {
 ///
 /// - [`FunctionBuilder::import_function`](https://docs.rs/cranelift-frontend/*/cranelift_frontend/struct.FunctionBuilder.html#method.import_function)
 /// for external functions
-/// - [`Module::declare_func_in_func`](https://docs.rs/cranelift-module/*/cranelift_module/struct.Module.html#method.declare_func_in_func)
+/// - [`Module::declare_func_in_func`](https://docs.rs/cranelift-module/*/cranelift_module/trait.Module.html#method.declare_func_in_func)
 /// for functions declared elsewhere in the same native
-/// [`Module`](https://docs.rs/cranelift-module/*/cranelift_module/struct.Module.html)
+/// [`Module`](https://docs.rs/cranelift-module/*/cranelift_module/trait.Module.html)
 /// - [`FuncEnvironment::make_direct_func`](https://docs.rs/cranelift-wasm/*/cranelift_wasm/trait.FuncEnvironment.html#tymethod.make_direct_func)
 /// for functions declared in the same WebAssembly
 /// [`FuncEnvironment`](https://docs.rs/cranelift-wasm/*/cranelift_wasm/trait.FuncEnvironment.html#tymethod.make_direct_func)
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -135,12 +135,28 @@ impl Into<AMode> for StackAMode {
 // Returns the size of stack space needed to store the
 // `int_reg` and `vec_reg`.
 fn saved_reg_stack_size(
+    call_conv: isa::CallConv,
    int_reg: &[Writable<RealReg>],
    vec_reg: &[Writable<RealReg>],
 ) -> (usize, usize) {
    // Round up to multiple of 2, to keep 16-byte stack alignment.
    let int_save_bytes = (int_reg.len() + (int_reg.len() & 1)) * 8;
-    let vec_save_bytes = vec_reg.len() * 16;
+    // The Baldrdash ABIs require saving and restoring the whole 16-byte
+    // SIMD & FP registers, so the necessary stack space is always a
+    // multiple of the mandatory 16-byte stack alignment. However, the
+    // Procedure Call Standard for the Arm 64-bit Architecture (AAPCS64,
+    // including several related ABIs such as the one used by Windows)
+    // mandates saving only the bottom 8 bytes of the vector registers,
+    // so in that case we round up the number of registers to ensure proper
+    // stack alignment (similarly to the situation with `int_reg`).
+    let vec_reg_size = if call_conv.extends_baldrdash() { 16 } else { 8 };
+    let vec_save_padding = if call_conv.extends_baldrdash() {
+        0
+    } else {
+        vec_reg.len() & 1
+    };
+    let vec_save_bytes = (vec_reg.len() + vec_save_padding) * vec_reg_size;
+
    (int_save_bytes, vec_save_bytes)
 }

@@ -171,6 +187,21 @@ impl ABIMachineSpec for AArch64MachineDeps {
        let has_baldrdash_tls = call_conv == isa::CallConv::Baldrdash2020;

        // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4.
+        //
+        // macOS aarch64 is slightly different, see also
+        // https://developer.apple.com/documentation/xcode/writing_arm64_code_for_apple_platforms.
+        // We are diverging from the macOS aarch64 implementation in the
+        // following ways:
+        // - sign- and zero-extension of data types narrower than 32 bits is
+        // not implemented yet.
+        // - i128 argument passing isn't implemented yet in the standard (non
+        // macOS) aarch64 ABI.
+        // - we align the argument stack space to a 16-byte boundary, while
+        // macOS allows aligning on only 8 bytes. In practice it means we're
+        // slightly overallocating when calling, which is fine, and doesn't
+        // break our other invariant that the stack is always allocated in
+        // 16-byte chunks.
+
        let mut next_xreg = 0;
        let mut next_vreg = 0;
        let mut next_stack: u64 = 0;
@@ -182,18 +213,26 @@ impl ABIMachineSpec for AArch64MachineDeps {
            next_stack = 16;
        }

-        // Note on return values: on the regular non-baldrdash ABI, we may return values in 8
-        // registers for V128 and I64 registers independently of the number of register values
-        // returned in the other class. That is, we can return values in up to 8 integer and 8
-        // vector registers at once.
-        // In Baldrdash, we can only use one register for return value for all the register
-        // classes. That is, we can't return values in both one integer and one vector register;
-        // only one return value may be in a register.
+        let (max_per_class_reg_vals, mut remaining_reg_vals) = match args_or_rets {
+            ArgsOrRets::Args => (8, 16), // x0-x7 and v0-v7

-        let (max_per_class_reg_vals, mut remaining_reg_vals) = match (args_or_rets, is_baldrdash) {
-            (ArgsOrRets::Args, _) => (8, 16),     // x0-x7 and v0-v7
-            (ArgsOrRets::Rets, false) => (8, 16), // x0-x7 and v0-v7
-            (ArgsOrRets::Rets, true) => (1, 1),   // x0 or v0, but not both
+            // Note on return values: on the regular ABI, we may return values
+            // in up to 8 registers of each class (I64 and V128), independently
+            // of the number of register values returned in the other class.
+            // That is, we can return values in up to 8 integer and 8 vector
+            // registers at once.
+            //
+            // In Baldrdash and Wasmtime, we can only use one register for the
+            // return value across all register classes. That is, we can't
+            // return values in both one integer and one vector register; only
+            // one return value may be in a register.
+            ArgsOrRets::Rets => {
+                if is_baldrdash || call_conv.extends_wasmtime() {
+                    (1, 1) // x0 or v0, but not both
+                } else {
+                    (8, 16) // x0-x7 and v0-v7
+                }
+            }
        };

        for i in 0..params.len() {
@@ -264,13 +303,27 @@ impl ABIMachineSpec for AArch64MachineDeps {
                *next_reg += 1;
                remaining_reg_vals -= 1;
            } else {
-                // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte
-                // stack alignment happens separately after all args.)
+                // Compute the stack slot's size.
                let size = (ty_bits(param.value_type) / 8) as u64;
-                let size = std::cmp::max(size, 8);
-                // Align.
+
+                let size = if call_conv == isa::CallConv::AppleAarch64
+                    || (call_conv.extends_wasmtime() && args_or_rets == ArgsOrRets::Rets)
+                {
+                    // macOS aarch64 and Wasmtime allow stack slots with
+                    // sizes less than 8 bytes. They still need to be
+                    // properly aligned to their natural data alignment,
+                    // though.
+                    size
+                } else {
+                    // Every arg takes a minimum slot of 8 bytes. (16-byte stack
+                    // alignment happens separately after all args.)
+                    std::cmp::max(size, 8)
+                };
+
+                // Align the stack slot.
                debug_assert!(size.is_power_of_two());
                next_stack = align_to(next_stack, size);
+
                ret.push(ABIArg::stack(
                    next_stack as i64,
                    param.value_type,
@@ -550,11 +603,13 @@ impl ABIMachineSpec for AArch64MachineDeps {
        flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        fixed_frame_storage_size: u32,
+        _outgoing_args_size: u32,
    ) -> (u64, SmallVec<[Inst; 16]>) {
        let mut insts = SmallVec::new();
        let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers);

-        let (int_save_bytes, vec_save_bytes) = saved_reg_stack_size(&clobbered_int, &clobbered_vec);
+        let (int_save_bytes, vec_save_bytes) =
+            saved_reg_stack_size(call_conv, &clobbered_int, &clobbered_vec);
        let total_save_bytes = int_save_bytes + vec_save_bytes;
        let clobber_size = total_save_bytes as i32;

@@ -583,59 +638,170 @@ impl ABIMachineSpec for AArch64MachineDeps {
        // `frame_offset` tracks offset above start-of-clobbers for unwind-info
        // purposes.
        let mut clobber_offset = clobber_size as u32;
-        for reg_pair in clobbered_int.chunks(2) {
-            let (r1, r2) = if reg_pair.len() == 2 {
-                // .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
-                (reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg())
-            } else {
-                (reg_pair[0].to_reg().to_reg(), zero_reg())
-            };
+        let clobber_offset_change = 16;
+        let iter = clobbered_int.chunks_exact(2);

-            debug_assert!(r1.get_class() == RegClass::I64);
-            debug_assert!(r2.get_class() == RegClass::I64);
+        if let [rd] = iter.remainder() {
+            let rd = rd.to_reg().to_reg();

-            // stp r1, r2, [sp, #-16]!
-            insts.push(Inst::StoreP64 {
-                rt: r1,
-                rt2: r2,
-                mem: PairAMode::PreIndexed(
+            debug_assert_eq!(rd.get_class(), RegClass::I64);
+            // str rd, [sp, #-16]!
+            insts.push(Inst::Store64 {
+                rd,
+                mem: AMode::PreIndexed(
                    writable_stack_reg(),
-                    SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
+                    SImm9::maybe_from_i64(-clobber_offset_change).unwrap(),
                ),
                flags: MemFlags::trusted(),
            });
+
            if flags.unwind_info() {
-                clobber_offset -= 8;
-                if r2 != zero_reg() {
+                clobber_offset -= clobber_offset_change as u32;
                insts.push(Inst::Unwind {
                    inst: UnwindInst::SaveReg {
                        clobber_offset,
-                            reg: r2.to_real_reg(),
-                        },
-                    });
-                }
-                clobber_offset -= 8;
-                insts.push(Inst::Unwind {
-                    inst: UnwindInst::SaveReg {
-                        clobber_offset,
-                        reg: r1.to_real_reg(),
+                        reg: rd.to_real_reg(),
                    },
                });
            }
        }

-        for reg in clobbered_vec.iter() {
-            insts.push(Inst::FpuStore128 {
-                rd: reg.to_reg().to_reg(),
-                mem: AMode::PreIndexed(writable_stack_reg(), SImm9::maybe_from_i64(-16).unwrap()),
+        let mut iter = iter.rev();
+
+        while let Some([rt, rt2]) = iter.next() {
+            // .to_reg().to_reg(): Writable<RealReg> --> RealReg --> Reg
+            let rt = rt.to_reg().to_reg();
+            let rt2 = rt2.to_reg().to_reg();
+
+            debug_assert!(rt.get_class() == RegClass::I64);
+            debug_assert!(rt2.get_class() == RegClass::I64);
+
+            // stp rt, rt2, [sp, #-16]!
+            insts.push(Inst::StoreP64 {
+                rt,
+                rt2,
+                mem: PairAMode::PreIndexed(
+                    writable_stack_reg(),
+                    SImm7Scaled::maybe_from_i64(-clobber_offset_change, types::I64).unwrap(),
+                ),
                flags: MemFlags::trusted(),
            });
+
            if flags.unwind_info() {
-                clobber_offset -= 16;
+                clobber_offset -= clobber_offset_change as u32;
                insts.push(Inst::Unwind {
                    inst: UnwindInst::SaveReg {
                        clobber_offset,
-                        reg: reg.to_reg(),
+                        reg: rt.to_real_reg(),
+                    },
+                });
+                insts.push(Inst::Unwind {
+                    inst: UnwindInst::SaveReg {
+                        clobber_offset: clobber_offset + (clobber_offset_change / 2) as u32,
+                        reg: rt2.to_real_reg(),
+                    },
+                });
+            }
+        }
+
+        let store_vec_reg = |rd| {
+            if call_conv.extends_baldrdash() {
+                Inst::FpuStore128 {
+                    rd,
+                    mem: AMode::PreIndexed(
+                        writable_stack_reg(),
+                        SImm9::maybe_from_i64(-clobber_offset_change).unwrap(),
+                    ),
+                    flags: MemFlags::trusted(),
+                }
+            } else {
+                Inst::FpuStore64 {
+                    rd,
+                    mem: AMode::PreIndexed(
+                        writable_stack_reg(),
+                        SImm9::maybe_from_i64(-clobber_offset_change).unwrap(),
+                    ),
+                    flags: MemFlags::trusted(),
+                }
+            }
+        };
+        let iter = clobbered_vec.chunks_exact(2);
+
+        if let [rd] = iter.remainder() {
+            let rd = rd.to_reg().to_reg();
+
+            debug_assert_eq!(rd.get_class(), RegClass::V128);
+            insts.push(store_vec_reg(rd));
+
+            if flags.unwind_info() {
+                clobber_offset -= clobber_offset_change as u32;
+                insts.push(Inst::Unwind {
+                    inst: UnwindInst::SaveReg {
+                        clobber_offset,
+                        reg: rd.to_real_reg(),
+                    },
+                });
+            }
+        }
+
+        let store_vec_reg_pair = |rt, rt2| {
+            if call_conv.extends_baldrdash() {
+                let clobber_offset_change = 32;
+
+                (
+                    Inst::FpuStoreP128 {
+                        rt,
+                        rt2,
+                        mem: PairAMode::PreIndexed(
+                            writable_stack_reg(),
+                            SImm7Scaled::maybe_from_i64(-clobber_offset_change, I8X16).unwrap(),
+                        ),
+                        flags: MemFlags::trusted(),
+                    },
+                    clobber_offset_change as u32,
+                )
+            } else {
+                let clobber_offset_change = 16;
+
+                (
+                    Inst::FpuStoreP64 {
+                        rt,
+                        rt2,
+                        mem: PairAMode::PreIndexed(
+                            writable_stack_reg(),
+                            SImm7Scaled::maybe_from_i64(-clobber_offset_change, F64).unwrap(),
+                        ),
+                        flags: MemFlags::trusted(),
+                    },
+                    clobber_offset_change as u32,
+                )
+            }
+        };
+        let mut iter = iter.rev();
+
+        while let Some([rt, rt2]) = iter.next() {
+            let rt = rt.to_reg().to_reg();
+            let rt2 = rt2.to_reg().to_reg();
+
+            debug_assert_eq!(rt.get_class(), RegClass::V128);
+            debug_assert_eq!(rt2.get_class(), RegClass::V128);
+
+            let (inst, clobber_offset_change) = store_vec_reg_pair(rt, rt2);
+
+            insts.push(inst);
+
+            if flags.unwind_info() {
+                clobber_offset -= clobber_offset_change;
+                insts.push(Inst::Unwind {
+                    inst: UnwindInst::SaveReg {
+                        clobber_offset,
+                        reg: rt.to_real_reg(),
+                    },
+                });
+                insts.push(Inst::Unwind {
+                    inst: UnwindInst::SaveReg {
+                        clobber_offset: clobber_offset + clobber_offset_change / 2,
+                        reg: rt2.to_real_reg(),
                    },
                });
            }
@@ -654,6 +820,7 @@ impl ABIMachineSpec for AArch64MachineDeps {
        flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        fixed_frame_storage_size: u32,
+        _outgoing_args_size: u32,
    ) -> SmallVec<[Inst; 16]> {
        let mut insts = SmallVec::new();
        let (clobbered_int, clobbered_vec) = get_regs_saved_in_prologue(call_conv, clobbers);
@@ -663,31 +830,83 @@ impl ABIMachineSpec for AArch64MachineDeps {
            insts.extend(Self::gen_sp_reg_adjust(fixed_frame_storage_size as i32));
        }

-        for reg in clobbered_vec.iter().rev() {
-            insts.push(Inst::FpuLoad128 {
-                rd: Writable::from_reg(reg.to_reg().to_reg()),
-                mem: AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()),
+        let load_vec_reg = |rd| {
+            if call_conv.extends_baldrdash() {
+                Inst::FpuLoad128 {
+                    rd,
+                    mem: AMode::PostIndexed(
+                        writable_stack_reg(),
+                        SImm9::maybe_from_i64(16).unwrap(),
+                    ),
                    flags: MemFlags::trusted(),
-            });
                }
-
-        for reg_pair in clobbered_int.chunks(2).rev() {
-            let (r1, r2) = if reg_pair.len() == 2 {
-                (
-                    reg_pair[0].map(|r| r.to_reg()),
-                    reg_pair[1].map(|r| r.to_reg()),
-                )
            } else {
-                (reg_pair[0].map(|r| r.to_reg()), writable_zero_reg())
+                Inst::FpuLoad64 {
+                    rd,
+                    mem: AMode::PostIndexed(
+                        writable_stack_reg(),
+                        SImm9::maybe_from_i64(16).unwrap(),
+                    ),
+                    flags: MemFlags::trusted(),
+                }
+            }
+        };
+        let load_vec_reg_pair = |rt, rt2| {
+            if call_conv.extends_baldrdash() {
+                Inst::FpuLoadP128 {
+                    rt,
+                    rt2,
+                    mem: PairAMode::PostIndexed(
+                        writable_stack_reg(),
+                        SImm7Scaled::maybe_from_i64(32, I8X16).unwrap(),
+                    ),
+                    flags: MemFlags::trusted(),
+                }
+            } else {
+                Inst::FpuLoadP64 {
+                    rt,
+                    rt2,
+                    mem: PairAMode::PostIndexed(
+                        writable_stack_reg(),
+                        SImm7Scaled::maybe_from_i64(16, F64).unwrap(),
+                    ),
+                    flags: MemFlags::trusted(),
+                }
+            }
        };

-            debug_assert!(r1.to_reg().get_class() == RegClass::I64);
-            debug_assert!(r2.to_reg().get_class() == RegClass::I64);
+        let mut iter = clobbered_vec.chunks_exact(2);

-            // ldp r1, r2, [sp], #16
+        while let Some([rt, rt2]) = iter.next() {
+            let rt = rt.map(|r| r.to_reg());
+            let rt2 = rt2.map(|r| r.to_reg());
+
+            debug_assert_eq!(rt.to_reg().get_class(), RegClass::V128);
+            debug_assert_eq!(rt2.to_reg().get_class(), RegClass::V128);
+            insts.push(load_vec_reg_pair(rt, rt2));
+        }
+
+        debug_assert!(iter.remainder().len() <= 1);
+
+        if let [rd] = iter.remainder() {
+            let rd = rd.map(|r| r.to_reg());
+
+            debug_assert_eq!(rd.to_reg().get_class(), RegClass::V128);
+            insts.push(load_vec_reg(rd));
+        }
+
+        let mut iter = clobbered_int.chunks_exact(2);
+
+        while let Some([rt, rt2]) = iter.next() {
+            let rt = rt.map(|r| r.to_reg());
+            let rt2 = rt2.map(|r| r.to_reg());
+
+            debug_assert_eq!(rt.to_reg().get_class(), RegClass::I64);
+            debug_assert_eq!(rt2.to_reg().get_class(), RegClass::I64);
+            // ldp rt, rt2, [sp], #16
            insts.push(Inst::LoadP64 {
-                rt: r1,
-                rt2: r2,
+                rt,
+                rt2,
                mem: PairAMode::PostIndexed(
                    writable_stack_reg(),
                    SImm7Scaled::maybe_from_i64(16, I64).unwrap(),
@@ -696,6 +915,20 @@ impl ABIMachineSpec for AArch64MachineDeps {
            });
        }

+        debug_assert!(iter.remainder().len() <= 1);
+
+        if let [rd] = iter.remainder() {
+            let rd = rd.map(|r| r.to_reg());
+
+            debug_assert_eq!(rd.to_reg().get_class(), RegClass::I64);
+            // ldr rd, [sp], #16
+            insts.push(Inst::ULoad64 {
+                rd,
+                mem: AMode::PostIndexed(writable_stack_reg(), SImm9::maybe_from_i64(16).unwrap()),
+                flags: MemFlags::trusted(),
+            });
+        }
+
        // If this is Baldrdash-2020, restore the callee (i.e., our) TLS
        // register. We may have allocated it for something else and clobbered
        // it, but the ABI expects us to leave the TLS register unchanged.
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -258,6 +258,28 @@ fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
        | machreg_to_vec(rt.to_reg())
 }

+/// Builds the 32-bit encoding of a load/store-pair instruction whose data
+/// registers are vector (SIMD & FP) registers.
+///
+/// `opc` (2 bits) lands at bits 31:30 and `amode` (2 bits) at bits 24:23;
+/// `is_load` becomes bit 22. The bits of `simm7` are shifted to bit 15,
+/// `rt2` to bit 10, the base register `rn` to bit 5, and `rt` occupies the
+/// lowest bits.
+fn enc_ldst_vec_pair(
+    opc: u32,
+    amode: u32,
+    is_load: bool,
+    simm7: SImm7Scaled,
+    rn: Reg,
+    rt: Reg,
+    rt2: Reg,
+) -> u32 {
+    debug_assert_eq!(opc & 0b11, opc);
+    debug_assert_eq!(amode & 0b11, amode);
+
+    // Bits common to every encoding produced by this helper.
+    let fixed_bits: u32 = 0b00_10110_00_0_0000000_00000_00000_00000;
+    let load_bit = u32::from(is_load);
+
+    fixed_bits
+        | (opc << 30)
+        | (amode << 23)
+        | (load_bit << 22)
+        | (simm7.bits() << 15)
+        | (machreg_to_vec(rt2) << 10)
+        | (machreg_to_gpr(rn) << 5)
+        | machreg_to_vec(rt)
+}
+
 fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
    (top11 << 21)
        | (machreg_to_vec(rm) << 16)
@@ -405,6 +427,15 @@ fn enc_vec_rr_misc(qu: u32, size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: R
        | machreg_to_vec(rd.to_reg())
 }

+/// Builds the 32-bit encoding of a one-source vector instruction that
+/// operates on a pair of elements (used for `Inst::VecRRPair`).
+///
+/// `bits_12_16` is the 5-bit operation selector placed at bits 16:12; `rn`
+/// is shifted to bit 5 and `rd` occupies the lowest bits.
+fn enc_vec_rr_pair(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
+
+    // Bits 16:12 are deliberately left clear in the template so that
+    // `bits_12_16` alone determines that field. With the selector for ADDP
+    // (0b11011) the resulting word is identical to the previous template,
+    // which had those bits hard-coded and therefore ignored the argument.
+    0b010_11110_11_11000_00000_10_00000_00000
+        | bits_12_16 << 12
+        | machreg_to_vec(rn) << 5
+        | machreg_to_vec(rd.to_reg())
+}
+
 fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
    debug_assert_eq!(q & 0b1, q);
    debug_assert_eq!(u & 0b1, u);
@@ -923,7 +954,7 @@ impl MachInstEmit for Inst {

                let srcloc = state.cur_srcloc();
                if srcloc != SourceLoc::default() && !flags.notrap() {
-                    // Register the offset at which the actual load instruction starts.
+                    // Register the offset at which the actual store instruction starts.
                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
                }

@@ -987,7 +1018,7 @@ impl MachInstEmit for Inst {
            } => {
                let srcloc = state.cur_srcloc();
                if srcloc != SourceLoc::default() && !flags.notrap() {
-                    // Register the offset at which the actual load instruction starts.
+                    // Register the offset at which the actual store instruction starts.
                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
                }
                match mem {
@@ -1034,6 +1065,120 @@ impl MachInstEmit for Inst {
                    }
                }
            }
+            &Inst::FpuLoadP64 {
+                rt,
+                rt2,
+                ref mem,
+                flags,
+            }
+            | &Inst::FpuLoadP128 {
+                rt,
+                rt2,
+                ref mem,
+                flags,
+            } => {
+                let srcloc = state.cur_srcloc();
+
+                if srcloc != SourceLoc::default() && !flags.notrap() {
+                    // Register the offset at which the actual load instruction starts.
+                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                }
+
+                let opc = match self {
+                    &Inst::FpuLoadP64 { .. } => 0b01,
+                    &Inst::FpuLoadP128 { .. } => 0b10,
+                    _ => unreachable!(),
+                };
+                let rt = rt.to_reg();
+                let rt2 = rt2.to_reg();
+
+                match mem {
+                    &PairAMode::SignedOffset(reg, simm7) => {
+                        assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16);
+                        sink.put4(enc_ldst_vec_pair(opc, 0b10, true, simm7, reg, rt, rt2));
+                    }
+                    &PairAMode::PreIndexed(reg, simm7) => {
+                        assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16);
+                        sink.put4(enc_ldst_vec_pair(
+                            opc,
+                            0b11,
+                            true,
+                            simm7,
+                            reg.to_reg(),
+                            rt,
+                            rt2,
+                        ));
+                    }
+                    &PairAMode::PostIndexed(reg, simm7) => {
+                        assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16);
+                        sink.put4(enc_ldst_vec_pair(
+                            opc,
+                            0b01,
+                            true,
+                            simm7,
+                            reg.to_reg(),
+                            rt,
+                            rt2,
+                        ));
+                    }
+                }
+            }
+            &Inst::FpuStoreP64 {
+                rt,
+                rt2,
+                ref mem,
+                flags,
+            }
+            | &Inst::FpuStoreP128 {
+                rt,
+                rt2,
+                ref mem,
+                flags,
+            } => {
+                let srcloc = state.cur_srcloc();
+
+                if srcloc != SourceLoc::default() && !flags.notrap() {
+                    // Register the offset at which the actual store instruction starts.
+                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                }
+
+                let opc = match self {
+                    &Inst::FpuStoreP64 { .. } => 0b01,
+                    &Inst::FpuStoreP128 { .. } => 0b10,
+                    _ => unreachable!(),
+                };
+
+                match mem {
+                    &PairAMode::SignedOffset(reg, simm7) => {
+                        assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16);
+                        sink.put4(enc_ldst_vec_pair(opc, 0b10, false, simm7, reg, rt, rt2));
+                    }
+                    &PairAMode::PreIndexed(reg, simm7) => {
+                        assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16);
+                        sink.put4(enc_ldst_vec_pair(
+                            opc,
+                            0b11,
+                            false,
+                            simm7,
+                            reg.to_reg(),
+                            rt,
+                            rt2,
+                        ));
+                    }
+                    &PairAMode::PostIndexed(reg, simm7) => {
+                        assert!(simm7.scale_ty == F64 || simm7.scale_ty == I8X16);
+                        sink.put4(enc_ldst_vec_pair(
+                            opc,
+                            0b01,
+                            false,
+                            simm7,
+                            reg.to_reg(),
+                            rt,
+                            rt2,
+                        ));
+                    }
+                }
+            }
            &Inst::Mov64 { rd, rm } => {
                assert!(rd.to_reg().get_class() == rm.get_class());
                assert!(rm.get_class() == RegClass::I64);
@@ -1492,6 +1637,7 @@ impl MachInstEmit for Inst {
                        debug_assert!(size == VectorSize::Size8x8 || size == VectorSize::Size8x16);
                        (0b0, 0b00101, enc_size)
                    }
+                    VecMisc2::Cmeq0 => (0b0, 0b01001, enc_size),
                };
                sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
            }
@@ -1918,6 +2064,13 @@ impl MachInstEmit for Inst {
                        | machreg_to_vec(rd.to_reg()),
                );
            }
+            &Inst::VecRRPair { op, rd, rn } => {
+                let bits_12_16 = match op {
+                    VecPairOp::Addp => 0b11011,
+                };
+
+                sink.put4(enc_vec_rr_pair(bits_12_16, rd, rn));
+            }
            &Inst::VecRRR {
                rd,
                rn,
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2311,6 +2311,16 @@ fn test_aarch64_binemit() {
        "sqxtun v16.8b, v23.8h",
    ));

+    insns.push((
+        Inst::VecRRPair {
+            op: VecPairOp::Addp,
+            rd: writable_vreg(0),
+            rn: vreg(30),
+        },
+        "C0BBF15E",
+        "addp d0, v30.2d",
+    ));
+
    insns.push((
        Inst::VecRRR {
            alu_op: VecALUOp::Sqadd,
@@ -3803,6 +3813,17 @@ fn test_aarch64_binemit() {
        "cnt v23.8b, v5.8b",
    ));

+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Cmeq0,
+            rd: writable_vreg(12),
+            rn: vreg(27),
+            size: VectorSize::Size16x8,
+        },
+        "6C9B604E",
+        "cmeq v12.8h, v27.8h, #0",
+    ));
+
    insns.push((
        Inst::VecLanes {
            op: VecLanesOp::Uminv,
@@ -5105,6 +5126,168 @@ fn test_aarch64_binemit() {
        "str q16, [x8, x9, LSL #4]",
    ));

+    insns.push((
+        Inst::FpuLoadP64 {
+            rt: writable_vreg(0),
+            rt2: writable_vreg(31),
+            mem: PairAMode::SignedOffset(xreg(0), SImm7Scaled::zero(F64)),
+            flags: MemFlags::trusted(),
+        },
+        "007C406D",
+        "ldp d0, d31, [x0]",
+    ));
+
+    insns.push((
+        Inst::FpuLoadP64 {
+            rt: writable_vreg(19),
+            rt2: writable_vreg(11),
+            mem: PairAMode::PreIndexed(
+                writable_xreg(25),
+                SImm7Scaled::maybe_from_i64(-512, F64).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "332FE06D",
+        "ldp d19, d11, [x25, #-512]!",
+    ));
+
+    insns.push((
+        Inst::FpuLoadP64 {
+            rt: writable_vreg(7),
+            rt2: writable_vreg(20),
+            mem: PairAMode::PostIndexed(
+                writable_stack_reg(),
+                SImm7Scaled::maybe_from_i64(64, F64).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "E753C46C",
+        "ldp d7, d20, [sp], #64",
+    ));
+
+    insns.push((
+        Inst::FpuStoreP64 {
+            rt: vreg(4),
+            rt2: vreg(26),
+            mem: PairAMode::SignedOffset(
+                stack_reg(),
+                SImm7Scaled::maybe_from_i64(504, F64).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "E4EB1F6D",
+        "stp d4, d26, [sp, #504]",
+    ));
+
+    insns.push((
+        Inst::FpuStoreP64 {
+            rt: vreg(16),
+            rt2: vreg(8),
+            mem: PairAMode::PreIndexed(
+                writable_xreg(15),
+                SImm7Scaled::maybe_from_i64(48, F64).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "F021836D",
+        "stp d16, d8, [x15, #48]!",
+    ));
+
+    insns.push((
+        Inst::FpuStoreP64 {
+            rt: vreg(5),
+            rt2: vreg(6),
+            mem: PairAMode::PostIndexed(
+                writable_xreg(28),
+                SImm7Scaled::maybe_from_i64(-32, F64).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "851BBE6C",
+        "stp d5, d6, [x28], #-32",
+    ));
+
+    insns.push((
+        Inst::FpuLoadP128 {
+            rt: writable_vreg(0),
+            rt2: writable_vreg(17),
+            mem: PairAMode::SignedOffset(xreg(3), SImm7Scaled::zero(I8X16)),
+            flags: MemFlags::trusted(),
+        },
+        "604440AD",
+        "ldp q0, q17, [x3]",
+    ));
+
+    insns.push((
+        Inst::FpuLoadP128 {
+            rt: writable_vreg(29),
+            rt2: writable_vreg(9),
+            mem: PairAMode::PreIndexed(
+                writable_xreg(16),
+                SImm7Scaled::maybe_from_i64(-1024, I8X16).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "1D26E0AD",
+        "ldp q29, q9, [x16, #-1024]!",
+    ));
+
+    insns.push((
+        Inst::FpuLoadP128 {
+            rt: writable_vreg(10),
+            rt2: writable_vreg(20),
+            mem: PairAMode::PostIndexed(
+                writable_xreg(26),
+                SImm7Scaled::maybe_from_i64(256, I8X16).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "4A53C8AC",
+        "ldp q10, q20, [x26], #256",
+    ));
+
+    insns.push((
+        Inst::FpuStoreP128 {
+            rt: vreg(9),
+            rt2: vreg(31),
+            mem: PairAMode::SignedOffset(
+                stack_reg(),
+                SImm7Scaled::maybe_from_i64(1008, I8X16).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "E9FF1FAD",
+        "stp q9, q31, [sp, #1008]",
+    ));
+
+    insns.push((
+        Inst::FpuStoreP128 {
+            rt: vreg(27),
+            rt2: vreg(13),
+            mem: PairAMode::PreIndexed(
+                writable_stack_reg(),
+                SImm7Scaled::maybe_from_i64(-192, I8X16).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "FB37BAAD",
+        "stp q27, q13, [sp, #-192]!",
+    ));
+
+    insns.push((
+        Inst::FpuStoreP128 {
+            rt: vreg(18),
+            rt2: vreg(22),
+            mem: PairAMode::PostIndexed(
+                writable_xreg(13),
+                SImm7Scaled::maybe_from_i64(304, I8X16).unwrap(),
+            ),
+            flags: MemFlags::trusted(),
+        },
+        "B2D989AC",
+        "stp q18, q22, [x13], #304",
+    ));
+
    insns.push((
        Inst::LoadFpuConst64 {
            rd: writable_vreg(16),
--- a/cranelift/codegen/src/isa/aarch64/inst/imms.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs
@@ -73,7 +73,7 @@ impl SImm7Scaled {
    /// Create a SImm7Scaled from a raw offset and the known scale type, if
    /// possible.
    pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option<SImm7Scaled> {
-        assert!(scale_ty == I64 || scale_ty == I32);
+        assert!(scale_ty == I64 || scale_ty == I32 || scale_ty == F64 || scale_ty == I8X16);
        let scale = scale_ty.bytes();
        assert!(scale.is_power_of_two());
        let scale = i64::from(scale);
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -334,6 +334,8 @@ pub enum VecMisc2 {
    Frintp,
    /// Population count per byte
    Cnt,
+    /// Compare bitwise equal to 0
+    Cmeq0,
 }

 /// A Vector narrowing operation with two registers.
@@ -347,6 +349,13 @@ pub enum VecMiscNarrowOp {
    Sqxtun,
 }

+/// A vector operation on a pair of elements with one register.
+///
+/// This is the `op` selector carried by `Inst::VecRRPair`, which reads one
+/// source register and writes one destination register.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum VecPairOp {
+    /// Add pair of elements (pretty-printed with the `addp` mnemonic).
+    Addp,
+}
+
 /// An operation across the lanes of vectors.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecLanesOp {
@@ -848,7 +857,34 @@ pub enum Inst {
        mem: AMode,
        flags: MemFlags,
    },
-
+    /// A load of a pair of floating-point registers, double precision (64-bit).
+    FpuLoadP64 {
+        rt: Writable<Reg>,
+        rt2: Writable<Reg>,
+        mem: PairAMode,
+        flags: MemFlags,
+    },
+    /// A store of a pair of floating-point registers, double precision (64-bit).
+    FpuStoreP64 {
+        rt: Reg,
+        rt2: Reg,
+        mem: PairAMode,
+        flags: MemFlags,
+    },
+    /// A load of a pair of floating-point registers, 128-bit.
+    FpuLoadP128 {
+        rt: Writable<Reg>,
+        rt2: Writable<Reg>,
+        mem: PairAMode,
+        flags: MemFlags,
+    },
+    /// A store of a pair of floating-point registers, 128-bit.
+    FpuStoreP128 {
+        rt: Reg,
+        rt2: Reg,
+        mem: PairAMode,
+        flags: MemFlags,
+    },
    LoadFpuConst64 {
        rd: Writable<Reg>,
        const_data: u64,
@@ -984,6 +1020,13 @@ pub enum Inst {
        high_half: bool,
    },

+    /// 1-operand vector instruction that operates on a pair of elements.
+    VecRRPair {
+        op: VecPairOp,
+        rd: Writable<Reg>,
+        rn: Reg,
+    },
+
    /// A vector ALU op.
    VecRRR {
        alu_op: VecALUOp,
@@ -1908,6 +1951,34 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            collector.add_use(rd);
            memarg_regs(mem, collector);
        }
+        &Inst::FpuLoadP64 {
+            rt, rt2, ref mem, ..
+        } => {
+            collector.add_def(rt);
+            collector.add_def(rt2);
+            pairmemarg_regs(mem, collector);
+        }
+        &Inst::FpuStoreP64 {
+            rt, rt2, ref mem, ..
+        } => {
+            collector.add_use(rt);
+            collector.add_use(rt2);
+            pairmemarg_regs(mem, collector);
+        }
+        &Inst::FpuLoadP128 {
+            rt, rt2, ref mem, ..
+        } => {
+            collector.add_def(rt);
+            collector.add_def(rt2);
+            pairmemarg_regs(mem, collector);
+        }
+        &Inst::FpuStoreP128 {
+            rt, rt2, ref mem, ..
+        } => {
+            collector.add_use(rt);
+            collector.add_use(rt2);
+            pairmemarg_regs(mem, collector);
+        }
        &Inst::LoadFpuConst64 { rd, .. } | &Inst::LoadFpuConst128 { rd, .. } => {
            collector.add_def(rd);
        }
@@ -1973,6 +2044,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                collector.add_def(rd);
            }
        }
+        &Inst::VecRRPair { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
        &Inst::VecRRR {
            alu_op, rd, rn, rm, ..
        } => {
@@ -2590,6 +2665,46 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
            map_use(mapper, rd);
            map_mem(mapper, mem);
        }
+        &mut Inst::FpuLoadP64 {
+            ref mut rt,
+            ref mut rt2,
+            ref mut mem,
+            ..
+        } => {
+            map_def(mapper, rt);
+            map_def(mapper, rt2);
+            map_pairmem(mapper, mem);
+        }
+        &mut Inst::FpuStoreP64 {
+            ref mut rt,
+            ref mut rt2,
+            ref mut mem,
+            ..
+        } => {
+            map_use(mapper, rt);
+            map_use(mapper, rt2);
+            map_pairmem(mapper, mem);
+        }
+        &mut Inst::FpuLoadP128 {
+            ref mut rt,
+            ref mut rt2,
+            ref mut mem,
+            ..
+        } => {
+            map_def(mapper, rt);
+            map_def(mapper, rt2);
+            map_pairmem(mapper, mem);
+        }
+        &mut Inst::FpuStoreP128 {
+            ref mut rt,
+            ref mut rt2,
+            ref mut mem,
+            ..
+        } => {
+            map_use(mapper, rt);
+            map_use(mapper, rt2);
+            map_pairmem(mapper, mem);
+        }
        &mut Inst::LoadFpuConst64 { ref mut rd, .. } => {
            map_def(mapper, rd);
        }
@@ -2721,6 +2836,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                map_def(mapper, rd);
            }
        }
+        &mut Inst::VecRRPair {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
        &mut Inst::VecRRR {
            alu_op,
            ref mut rd,
@@ -3508,6 +3631,42 @@ impl Inst {
                let mem = mem.show_rru(mb_rru);
                format!("{}str {}, {}", mem_str, rd, mem)
            }
+            &Inst::FpuLoadP64 {
+                rt, rt2, ref mem, ..
+            } => {
+                let rt = show_vreg_scalar(rt.to_reg(), mb_rru, ScalarSize::Size64);
+                let rt2 = show_vreg_scalar(rt2.to_reg(), mb_rru, ScalarSize::Size64);
+                let mem = mem.show_rru(mb_rru);
+
+                format!("ldp {}, {}, {}", rt, rt2, mem)
+            }
+            &Inst::FpuStoreP64 {
+                rt, rt2, ref mem, ..
+            } => {
+                let rt = show_vreg_scalar(rt, mb_rru, ScalarSize::Size64);
+                let rt2 = show_vreg_scalar(rt2, mb_rru, ScalarSize::Size64);
+                let mem = mem.show_rru(mb_rru);
+
+                format!("stp {}, {}, {}", rt, rt2, mem)
+            }
+            &Inst::FpuLoadP128 {
+                rt, rt2, ref mem, ..
+            } => {
+                let rt = show_vreg_scalar(rt.to_reg(), mb_rru, ScalarSize::Size128);
+                let rt2 = show_vreg_scalar(rt2.to_reg(), mb_rru, ScalarSize::Size128);
+                let mem = mem.show_rru(mb_rru);
+
+                format!("ldp {}, {}, {}", rt, rt2, mem)
+            }
+            &Inst::FpuStoreP128 {
+                rt, rt2, ref mem, ..
+            } => {
+                let rt = show_vreg_scalar(rt, mb_rru, ScalarSize::Size128);
+                let rt2 = show_vreg_scalar(rt2, mb_rru, ScalarSize::Size128);
+                let mem = mem.show_rru(mb_rru);
+
+                format!("stp {}, {}, {}", rt, rt2, mem)
+            }
            &Inst::LoadFpuConst64 { rd, const_data } => {
                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
                format!(
@@ -3725,6 +3884,15 @@ impl Inst {
                };
                format!("{} {}, {}", op, rd, rn)
            }
+            &Inst::VecRRPair { op, rd, rn } => {
+                let op = match op {
+                    VecPairOp::Addp => "addp",
+                };
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
+                let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size64x2);
+
+                format!("{} {}, {}", op, rd, rn)
+            }
            &Inst::VecRRR {
                rd,
                rn,
@@ -3788,43 +3956,44 @@ impl Inst {
                format!("{} {}, {}, {}", op, rd, rn, rm)
            }
            &Inst::VecMisc { op, rd, rn, size } => {
-                let is_shll = op == VecMisc2::Shll;
-                let suffix = match (is_shll, size) {
-                    (true, VectorSize::Size8x8) => ", #8",
-                    (true, VectorSize::Size16x4) => ", #16",
-                    (true, VectorSize::Size32x2) => ", #32",
-                    _ => "",
-                };
-
-                let (op, size) = match op {
-                    VecMisc2::Not => (
-                        "mvn",
-                        if size.is_128bits() {
+                let (op, rd_size, size, suffix) = match op {
+                    VecMisc2::Not => {
+                        let size = if size.is_128bits() {
                            VectorSize::Size8x16
                        } else {
                            VectorSize::Size8x8
-                        },
-                    ),
-                    VecMisc2::Neg => ("neg", size),
-                    VecMisc2::Abs => ("abs", size),
-                    VecMisc2::Fabs => ("fabs", size),
-                    VecMisc2::Fneg => ("fneg", size),
-                    VecMisc2::Fsqrt => ("fsqrt", size),
-                    VecMisc2::Rev64 => ("rev64", size),
-                    VecMisc2::Shll => ("shll", size),
-                    VecMisc2::Fcvtzs => ("fcvtzs", size),
-                    VecMisc2::Fcvtzu => ("fcvtzu", size),
-                    VecMisc2::Scvtf => ("scvtf", size),
-                    VecMisc2::Ucvtf => ("ucvtf", size),
-                    VecMisc2::Frintn => ("frintn", size),
-                    VecMisc2::Frintz => ("frintz", size),
-                    VecMisc2::Frintm => ("frintm", size),
-                    VecMisc2::Frintp => ("frintp", size),
-                    VecMisc2::Cnt => ("cnt", size),
                        };

-                let rd_size = if is_shll { size.widen() } else { size };
-
+                        ("mvn", size, size, "")
+                    }
+                    VecMisc2::Neg => ("neg", size, size, ""),
+                    VecMisc2::Abs => ("abs", size, size, ""),
+                    VecMisc2::Fabs => ("fabs", size, size, ""),
+                    VecMisc2::Fneg => ("fneg", size, size, ""),
+                    VecMisc2::Fsqrt => ("fsqrt", size, size, ""),
+                    VecMisc2::Rev64 => ("rev64", size, size, ""),
+                    VecMisc2::Shll => (
+                        "shll",
+                        size.widen(),
+                        size,
+                        match size {
+                            VectorSize::Size8x8 => ", #8",
+                            VectorSize::Size16x4 => ", #16",
+                            VectorSize::Size32x2 => ", #32",
+                            _ => panic!("Unexpected vector size: {:?}", size),
+                        },
+                    ),
+                    VecMisc2::Fcvtzs => ("fcvtzs", size, size, ""),
+                    VecMisc2::Fcvtzu => ("fcvtzu", size, size, ""),
+                    VecMisc2::Scvtf => ("scvtf", size, size, ""),
+                    VecMisc2::Ucvtf => ("ucvtf", size, size, ""),
+                    VecMisc2::Frintn => ("frintn", size, size, ""),
+                    VecMisc2::Frintz => ("frintz", size, size, ""),
+                    VecMisc2::Frintm => ("frintm", size, size, ""),
+                    VecMisc2::Frintp => ("frintp", size, size, ""),
+                    VecMisc2::Cnt => ("cnt", size, size, ""),
+                    VecMisc2::Cmeq0 => ("cmeq", size, size, ", #0"),
+                };
                let rd = show_vreg_vector(rd.to_reg(), mb_rru, rd_size);
                let rn = show_vreg_vector(rn, mb_rru, size);
                format!("{} {}, {}{}", op, rd, rn, suffix)
--- a/cranelift/codegen/src/isa/aarch64/inst/unwind/systemv.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/unwind/systemv.rs
@@ -56,8 +56,8 @@ impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper {
    fn sp(&self) -> u16 {
        regs::stack_reg().get_hw_encoding().into()
    }
-    fn fp(&self) -> u16 {
-        regs::fp_reg().get_hw_encoding().into()
+    fn fp(&self) -> Option<u16> {
+        Some(regs::fp_reg().get_hw_encoding().into())
    }
    fn lr(&self) -> Option<u16> {
        Some(regs::link_reg().get_hw_encoding().into())
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1950,6 +1950,40 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            }
        }

+        Opcode::VallTrue if ctx.input_ty(insn, 0) == I64X2 => {
+            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();
+
+            // cmeq vtmp.2d, vm.2d, #0
+            // addp dtmp, vtmp.2d
+            // fcmp dtmp, dtmp
+            // cset xd, eq
+            //
+            // Note that after the ADDP the value of the temporary register will
+            // be either 0 when all input elements are true, i.e. non-zero, or a
+            // NaN otherwise (either -1 or -2 when represented as an integer);
+            // NaNs are the only floating-point numbers that compare unequal to
+            // themselves.
+
+            ctx.emit(Inst::VecMisc {
+                op: VecMisc2::Cmeq0,
+                rd: tmp,
+                rn: rm,
+                size: VectorSize::Size64x2,
+            });
+            ctx.emit(Inst::VecRRPair {
+                op: VecPairOp::Addp,
+                rd: tmp,
+                rn: tmp.to_reg(),
+            });
+            ctx.emit(Inst::FpuCmp64 {
+                rn: tmp.to_reg(),
+                rm: tmp.to_reg(),
+            });
+            materialize_bool_result(ctx, insn, rd, Cond::Eq);
+        }
+
        Opcode::VanyTrue | Opcode::VallTrue => {
            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2180,6 +2214,47 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        size: VectorSize::Size32x4,
                    });
                }
+                I64X2 => {
+                    // mov dst_r, src_v.d[0]
+                    // mov tmp_r0, src_v.d[1]
+                    // lsr dst_r, dst_r, #63
+                    // lsr tmp_r0, tmp_r0, #63
+                    // add dst_r, dst_r, tmp_r0, lsl #1
+                    ctx.emit(Inst::MovFromVec {
+                        rd: dst_r,
+                        rn: src_v,
+                        idx: 0,
+                        size: VectorSize::Size64x2,
+                    });
+                    ctx.emit(Inst::MovFromVec {
+                        rd: tmp_r0,
+                        rn: src_v,
+                        idx: 1,
+                        size: VectorSize::Size64x2,
+                    });
+                    ctx.emit(Inst::AluRRImmShift {
+                        alu_op: ALUOp::Lsr64,
+                        rd: dst_r,
+                        rn: dst_r.to_reg(),
+                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
+                    });
+                    ctx.emit(Inst::AluRRImmShift {
+                        alu_op: ALUOp::Lsr64,
+                        rd: tmp_r0,
+                        rn: tmp_r0.to_reg(),
+                        immshift: ImmShift::maybe_from_u64(63).unwrap(),
+                    });
+                    ctx.emit(Inst::AluRRRShift {
+                        alu_op: ALUOp::Add32,
+                        rd: dst_r,
+                        rn: dst_r.to_reg(),
+                        rm: tmp_r0.to_reg(),
+                        shiftop: ShiftOpAndAmt::new(
+                            ShiftOp::LSL,
+                            ShiftOpShiftImm::maybe_from_shift(1).unwrap(),
+                        ),
+                    });
+                }
                _ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
            }
        }
@@ -3013,6 +3088,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }

        Opcode::TlsValue => unimplemented!("tls_value"),
+        Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
    }

    Ok(())
--- a/cranelift/codegen/src/isa/aarch64/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/mod.rs
@@ -7,10 +7,8 @@ use crate::isa::Builder as IsaBuilder;
 use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
 use crate::result::CodegenResult;
 use crate::settings as shared_settings;
-
-use alloc::boxed::Box;
+use alloc::{boxed::Box, vec::Vec};
 use core::hash::{Hash, Hasher};
-
 use regalloc::{PrettyPrint, RealRegUniverse};
 use target_lexicon::{Aarch64Architecture, Architecture, Triple};

@@ -104,6 +102,10 @@ impl MachBackend for AArch64Backend {
        &self.flags
    }

+    /// Returns the values of this backend's ISA-specific flags, collected
+    /// from `self.isa_flags.iter()` into a vector.
+    fn isa_flags(&self) -> Vec<shared_settings::Value> {
+        self.isa_flags.iter().collect()
+    }
+
    fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) {
        self.flags.hash(&mut hasher);
        self.isa_flags.hash(&mut hasher);
--- a/cranelift/codegen/src/isa/aarch64/settings.rs
+++ b/cranelift/codegen/src/isa/aarch64/settings.rs
@@ -1,6 +1,6 @@
 //! AArch64 Settings.

-use crate::settings::{self, detail, Builder};
+use crate::settings::{self, detail, Builder, Value};
 use core::fmt;

 // Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a
--- a/cranelift/codegen/src/isa/arm32/abi.rs
+++ b/cranelift/codegen/src/isa/arm32/abi.rs
@@ -319,6 +319,7 @@ impl ABIMachineSpec for Arm32MachineDeps {
        _flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        fixed_frame_storage_size: u32,
+        _outgoing_args_size: u32,
    ) -> (u64, SmallVec<[Inst; 16]>) {
        let mut insts = SmallVec::new();
        if fixed_frame_storage_size > 0 {
@@ -348,6 +349,7 @@ impl ABIMachineSpec for Arm32MachineDeps {
        _flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        _fixed_frame_storage_size: u32,
+        _outgoing_args_size: u32,
    ) -> SmallVec<[Inst; 16]> {
        let mut insts = SmallVec::new();
        let clobbered_vec = get_callee_saves(clobbers);
--- a/cranelift/codegen/src/isa/arm32/mod.rs
+++ b/cranelift/codegen/src/isa/arm32/mod.rs
@@ -7,7 +7,7 @@ use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter,
 use crate::result::CodegenResult;
 use crate::settings;

-use alloc::boxed::Box;
+use alloc::{boxed::Box, vec::Vec};
 use core::hash::{Hash, Hasher};
 use regalloc::{PrettyPrint, RealRegUniverse};
 use target_lexicon::{Architecture, ArmArchitecture, Triple};
@@ -92,6 +92,10 @@ impl MachBackend for Arm32Backend {
        &self.flags
    }

+    // This backend reports no ISA-specific flag values: the result is always
+    // an empty `Vec`.
+    fn isa_flags(&self) -> Vec<settings::Value> {
+        Vec::new()
+    }
+
    fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) {
        self.flags.hash(&mut hasher);
    }
--- a/cranelift/codegen/src/isa/call_conv.rs
+++ b/cranelift/codegen/src/isa/call_conv.rs
@@ -10,23 +10,35 @@ use serde::{Deserialize, Serialize};
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
 pub enum CallConv {
-    /// Best performance, not ABI-stable
+    /// Best performance, not ABI-stable.
    Fast,
-    /// Smallest caller code size, not ABI-stable
+    /// Smallest caller code size, not ABI-stable.
    Cold,
-    /// System V-style convention used on many platforms
+    /// System V-style convention used on many platforms.
    SystemV,
-    /// Windows "fastcall" convention, also used for x64 and ARM
+    /// Windows "fastcall" convention, also used for x64 and ARM.
    WindowsFastcall,
-    /// SpiderMonkey WebAssembly convention on systems using natively SystemV
+    /// Mac aarch64 calling convention, which is a tweaked aarch64 ABI.
+    AppleAarch64,
+    /// SpiderMonkey WebAssembly convention on systems using natively SystemV.
    BaldrdashSystemV,
-    /// SpiderMonkey WebAssembly convention on Windows
+    /// SpiderMonkey WebAssembly convention on Windows.
    BaldrdashWindows,
    /// SpiderMonkey WebAssembly convention for "ABI-2020", with extra TLS
    /// register slots in the frame.
    Baldrdash2020,
-    /// Specialized convention for the probestack function
+    /// Specialized convention for the probestack function.
    Probestack,
+    /// Wasmtime equivalent of SystemV, not ABI-stable.
+    ///
+    /// Currently only differs in how multiple return values are handled,
+    /// returning the first return value in a register and everything else
+    /// through a return-pointer.
+    WasmtimeSystemV,
+    /// Wasmtime equivalent of WindowsFastcall, not ABI-stable.
+    ///
+    /// Differs from fastcall in the same way as `WasmtimeSystemV`.
+    WasmtimeFastcall,
 }

 impl CallConv {
@@ -36,6 +48,7 @@ impl CallConv {
            // Default to System V for unknown targets because most everything
            // uses System V.
            Ok(CallingConvention::SystemV) | Err(()) => Self::SystemV,
+            Ok(CallingConvention::AppleAarch64) => Self::AppleAarch64,
            Ok(CallingConvention::WindowsFastcall) => Self::WindowsFastcall,
            Ok(unimp) => unimplemented!("calling convention: {:?}", unimp),
        }
@@ -49,6 +62,7 @@ impl CallConv {
            LibcallCallConv::Cold => Self::Cold,
            LibcallCallConv::SystemV => Self::SystemV,
            LibcallCallConv::WindowsFastcall => Self::WindowsFastcall,
+            LibcallCallConv::AppleAarch64 => Self::AppleAarch64,
            LibcallCallConv::BaldrdashSystemV => Self::BaldrdashSystemV,
            LibcallCallConv::BaldrdashWindows => Self::BaldrdashWindows,
            LibcallCallConv::Baldrdash2020 => Self::Baldrdash2020,
@@ -59,7 +73,7 @@ impl CallConv {
    /// Is the calling convention extending the Windows Fastcall ABI?
    pub fn extends_windows_fastcall(self) -> bool {
        match self {
-            Self::WindowsFastcall | Self::BaldrdashWindows => true,
+            Self::WindowsFastcall | Self::BaldrdashWindows | Self::WasmtimeFastcall => true,
            _ => false,
        }
    }
@@ -71,6 +85,14 @@ impl CallConv {
            _ => false,
        }
    }
+
+    /// Reports whether this calling convention is one of the Wasmtime-specific
+    /// variants, i.e. `WasmtimeSystemV` or `WasmtimeFastcall`.
+    pub fn extends_wasmtime(self) -> bool {
+        matches!(self, Self::WasmtimeSystemV | Self::WasmtimeFastcall)
+    }
 }

 impl fmt::Display for CallConv {
@@ -80,10 +102,13 @@ impl fmt::Display for CallConv {
            Self::Cold => "cold",
            Self::SystemV => "system_v",
            Self::WindowsFastcall => "windows_fastcall",
+            Self::AppleAarch64 => "apple_aarch64",
            Self::BaldrdashSystemV => "baldrdash_system_v",
            Self::BaldrdashWindows => "baldrdash_windows",
            Self::Baldrdash2020 => "baldrdash_2020",
            Self::Probestack => "probestack",
+            Self::WasmtimeSystemV => "wasmtime_system_v",
+            Self::WasmtimeFastcall => "wasmtime_fastcall",
        })
    }
 }
@@ -96,10 +121,13 @@ impl str::FromStr for CallConv {
            "cold" => Ok(Self::Cold),
            "system_v" => Ok(Self::SystemV),
            "windows_fastcall" => Ok(Self::WindowsFastcall),
+            "apple_aarch64" => Ok(Self::AppleAarch64),
            "baldrdash_system_v" => Ok(Self::BaldrdashSystemV),
            "baldrdash_windows" => Ok(Self::BaldrdashWindows),
            "baldrdash_2020" => Ok(Self::Baldrdash2020),
            "probestack" => Ok(Self::Probestack),
+            "wasmtime_system_v" => Ok(Self::WasmtimeSystemV),
+            "wasmtime_fastcall" => Ok(Self::WasmtimeFastcall),
            _ => Err(()),
        }
    }
--- a/cranelift/codegen/src/isa/mod.rs
+++ b/cranelift/codegen/src/isa/mod.rs
@@ -57,35 +57,34 @@ use crate::flowgraph;
 use crate::ir;
 #[cfg(feature = "unwind")]
 use crate::isa::unwind::systemv::RegisterMappingError;
-use crate::machinst::MachBackend;
+use crate::machinst::{MachBackend, UnwindInfoKind};
 use crate::regalloc;
 use crate::result::CodegenResult;
 use crate::settings;
 use crate::settings::SetResult;
 use crate::timing;
-use alloc::borrow::Cow;
-use alloc::boxed::Box;
+use alloc::{borrow::Cow, boxed::Box, vec::Vec};
 use core::any::Any;
 use core::fmt;
 use core::fmt::{Debug, Formatter};
 use core::hash::Hasher;
-use target_lexicon::{triple, Architecture, PointerWidth, Triple};
-use thiserror::Error;
+use target_lexicon::{triple, Architecture, OperatingSystem, PointerWidth, Triple};

 #[cfg(feature = "riscv")]
 mod riscv;

-// N.B.: the old x86-64 backend (`x86`) and the new one (`x64`) can both be
-// included; if the new backend is included, then it is the default backend
-// returned for an x86-64 triple, but a specific option can request the old
-// backend. It is important to have the ability to instantiate *both* backends
-// in the same build so that we can do things like differential fuzzing between
-// backends, or perhaps offer a runtime configuration flag in the future.
+// N.B.: the old x86-64 backend (`x86`) and the new one (`x64`) are both
+// included whenever building with x86 support. The new backend is the default,
+// but the old can be requested with `BackendVariant::Legacy`. However, if this
+// crate is built with the `old-x86-backend` feature, then the old backend is
+// default instead.
 #[cfg(feature = "x86")]
 mod x86;

-#[cfg(feature = "x64")]
-mod x64;
+// This module is made public here for benchmarking purposes. No guarantees are
+// made regarding API stability.
+#[cfg(feature = "x86")]
+pub mod x64;

 #[cfg(feature = "arm32")]
 mod arm32;
@@ -93,6 +92,9 @@ mod arm32;
 #[cfg(feature = "arm64")]
 pub(crate) mod aarch64;

+#[cfg(feature = "s390x")]
+mod s390x;
+
 pub mod unwind;

 mod call_conv;
@@ -123,7 +125,7 @@ macro_rules! isa_builder {
 /// The "variant" for a given target. On one platform (x86-64), we have two
 /// backends, the "old" and "new" one; the new one is the default if included
 /// in the build configuration and not otherwise specified.
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Debug)]
 pub enum BackendVariant {
    /// Any backend available.
    Any,
@@ -150,18 +152,19 @@ pub fn lookup_variant(triple: Triple, variant: BackendVariant) -> Result<Builder
            isa_builder!(x86, (feature = "x86"), triple)
        }
        (Architecture::X86_64, BackendVariant::MachInst) => {
-            isa_builder!(x64, (feature = "x64"), triple)
+            isa_builder!(x64, (feature = "x86"), triple)
        }
-        #[cfg(feature = "x64")]
+        #[cfg(not(feature = "old-x86-backend"))]
        (Architecture::X86_64, BackendVariant::Any) => {
-            isa_builder!(x64, (feature = "x64"), triple)
+            isa_builder!(x64, (feature = "x86"), triple)
        }
-        #[cfg(not(feature = "x64"))]
+        #[cfg(feature = "old-x86-backend")]
        (Architecture::X86_64, BackendVariant::Any) => {
            isa_builder!(x86, (feature = "x86"), triple)
        }
        (Architecture::Arm { .. }, _) => isa_builder!(arm32, (feature = "arm32"), triple),
        (Architecture::Aarch64 { .. }, _) => isa_builder!(aarch64, (feature = "arm64"), triple),
+        (Architecture::S390x { .. }, _) => isa_builder!(s390x, (feature = "s390x"), triple),
        _ => Err(LookupError::Unsupported),
    }
 }
@@ -180,17 +183,30 @@ pub fn lookup_by_name(name: &str) -> Result<Builder, LookupError> {
 }

 /// Describes reason for target lookup failure
-#[derive(Error, PartialEq, Eq, Copy, Clone, Debug)]
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
 pub enum LookupError {
    /// Support for this target was disabled in the current build.
-    #[error("Support for this target is disabled")]
    SupportDisabled,

    /// Support for this target has not yet been implemented.
-    #[error("Support for this target has not been implemented yet")]
    Unsupported,
 }

+// `Error` and `Display` are implemented manually instead of using thiserror to reduce the number
+// of dependencies used by Cranelift.
+impl std::error::Error for LookupError {}
+
+impl fmt::Display for LookupError {
+    /// Writes the fixed, human-readable description of the lookup failure.
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        // Pick the message first, then emit it with a single write.
+        let message = match *self {
+            LookupError::SupportDisabled => "Support for this target is disabled",
+            LookupError::Unsupported => "Support for this target has not been implemented yet",
+        };
+        f.write_str(message)
+    }
+}
+
 /// Builder for a `TargetIsa`.
 /// Modify the ISA-specific settings before creating the `TargetIsa` trait object with `finish`.
 #[derive(Clone)]
@@ -201,6 +217,16 @@ pub struct Builder {
 }

 impl Builder {
+    /// Gets the triple for the builder.
+    ///
+    /// The returned reference borrows the `triple` field stored in the
+    /// builder; nothing is cloned.
+    pub fn triple(&self) -> &Triple {
+        &self.triple
+    }
+
+    /// Iterates the available settings in the builder.
+    ///
+    /// This forwards directly to the iterator of the builder's `setup` field
+    /// and yields `settings::Setting` items.
+    pub fn iter(&self) -> impl Iterator<Item = settings::Setting> {
+        self.setup.iter()
+    }
+
    /// Combine the ISA-specific settings with the provided ISA-independent settings and allocate a
    /// fully configured `TargetIsa` trait object.
    pub fn finish(self, shared_flags: settings::Flags) -> Box<dyn TargetIsa> {
@@ -265,6 +291,14 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
    /// Get the ISA-independent flags that were used to make this trait object.
    fn flags(&self) -> &settings::Flags;

+    /// Get the ISA-dependent flag values that were used to make this trait object.
+    fn isa_flags(&self) -> Vec<settings::Value>;
+
+    /// Get the variant of this ISA (Legacy or MachInst).
+    ///
+    /// The default implementation reports `BackendVariant::Legacy`;
+    /// implementors that are not legacy backends are presumably expected to
+    /// override it (NOTE(review): confirm against the MachInst adapter).
+    fn variant(&self) -> BackendVariant {
+        BackendVariant::Legacy
+    }
+
    /// Hashes all flags, both ISA-independent and ISA-specific, into the
    /// specified hasher.
    fn hash_all_flags(&self, hasher: &mut dyn Hasher);
@@ -460,6 +494,18 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
    /// IntCC condition for Unsigned Subtraction Overflow (Borrow/Carry).
    fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC;

+    /// Returns the flavor of unwind information emitted for this target.
+    ///
+    /// The answer depends on the target triple's operating system and on the
+    /// `unwind` cargo feature:
+    ///
+    /// - with `unwind` enabled, Windows targets use `UnwindInfoKind::Windows`
+    ///   and every other operating system uses `UnwindInfoKind::SystemV`;
+    /// - with `unwind` disabled, the result is always `UnwindInfoKind::None`.
+    fn unwind_info_kind(&self) -> UnwindInfoKind {
+        match self.triple().operating_system {
+            // The first two arms only exist when the `unwind` feature is on.
+            #[cfg(feature = "unwind")]
+            OperatingSystem::Windows => UnwindInfoKind::Windows,
+            #[cfg(feature = "unwind")]
+            _ => UnwindInfoKind::SystemV,
+            // Without the feature this is the only arm left in the match.
+            #[cfg(not(feature = "unwind"))]
+            _ => UnwindInfoKind::None,
+        }
+    }
+
    /// Creates unwind information for the function.
    ///
    /// Returns `None` if there is no unwind information for the function.
--- a/cranelift/codegen/src/isa/riscv/mod.rs
+++ b/cranelift/codegen/src/isa/riscv/mod.rs
@@ -15,8 +15,7 @@ use crate::isa::enc_tables::{self as shared_enc_tables, lookup_enclist, Encoding
 use crate::isa::Builder as IsaBuilder;
 use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
 use crate::regalloc;
-use alloc::borrow::Cow;
-use alloc::boxed::Box;
+use alloc::{borrow::Cow, boxed::Box, vec::Vec};
 use core::any::Any;
 use core::fmt;
 use core::hash::{Hash, Hasher};
@@ -70,6 +69,10 @@ impl TargetIsa for Isa {
        &self.shared_flags
    }

+    // Returns the ISA-specific flag values, gathered by collecting
+    // `self.isa_flags.iter()` into an owned `Vec`.
+    fn isa_flags(&self) -> Vec<shared_settings::Value> {
+        self.isa_flags.iter().collect()
+    }
+
    fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) {
        self.shared_flags.hash(&mut hasher);
        self.isa_flags.hash(&mut hasher);
--- a/cranelift/codegen/src/isa/riscv/settings.rs
+++ b/cranelift/codegen/src/isa/riscv/settings.rs
@@ -1,6 +1,6 @@
 //! RISC-V Settings.

-use crate::settings::{self, detail, Builder};
+use crate::settings::{self, detail, Builder, Value};
 use core::fmt;

 // Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
--- a/cranelift/codegen/src/isa/s390x/abi.rs
+++ b/cranelift/codegen/src/isa/s390x/abi.rs
@@ -0,0 +1,770 @@
+//! Implementation of a standard S390x ABI.
+//!
+//! This machine uses the "vanilla" ABI implementation from abi_impl.rs,
+//! however a few details are different from the description there:
+//!
+//! - On s390x, the caller must provide a "register save area" of 160
+//!   bytes to any function it calls.  The called function is free to use
+//!   this space for any purpose; usually to save callee-saved GPRs.
+//!   (Note that while this area is allocated by the caller, it is counted
+//!   as part of the callee's stack frame; in particular, the callee's CFA
+//!   is the top of the register save area, not the incoming SP value.)
+//!
+//! - Overflow arguments are passed on the stack starting immediately
+//!   above the register save area.  On s390x, this space is allocated
+//!   only once directly in the prologue, using a size large enough to
+//!   hold overflow arguments for every call in the function.
+//!
+//! - On s390x we do not use a frame pointer register; instead, every
+//!   element of the stack frame is addressed via (constant) offsets
+//!   from the stack pointer.  Note that due to the above (and because
+//!   there are no variable-sized stack allocations in cranelift), the
+//!   value of the stack pointer register never changes after the
+//!   initial allocation in the function prologue.
+//!
+//! Overall, the stack frame layout on s390x is as follows:
+//!
+//! ```plain
+//!   (high address)
+//!
+//!                              +---------------------------+
+//!                              |          ...              |
+//! CFA                  ----->  | stack args                |
+//!                              +---------------------------+
+//!                              |          ...              |
+//!                              | 160 bytes reg save area   |
+//! SP at function entry ----->  | (used to save GPRs)       |
+//!                              +---------------------------+
+//!                              |          ...              |
+//!                              | clobbered callee-saves    |
+//!                              | (used to save FPRs)       |
+//! unwind-frame base     ---->  | (alloc'd by prologue)     |
+//!                              +---------------------------+
+//!                              |          ...              |
+//!                              | spill slots               |
+//!                              | (accessed via nominal SP) |
+//!                              |          ...              |
+//!                              | stack slots               |
+//!                              | (accessed via nominal SP) |
+//! nominal SP --------------->  | (alloc'd by prologue)     |
+//!                              +---------------------------+
+//!                              |          ...              |
+//!                              | args for call             |
+//!                              | outgoing reg save area    |
+//! SP during function  ------>  | (alloc'd by prologue)     |
+//!                              +---------------------------+
+//!
+//!   (low address)
+//! ```
+
+use crate::ir;
+use crate::ir::condcodes::IntCC;
+use crate::ir::types;
+use crate::ir::MemFlags;
+use crate::ir::Type;
+use crate::isa;
+use crate::isa::s390x::inst::*;
+use crate::isa::unwind::UnwindInst;
+use crate::machinst::*;
+use crate::settings;
+use crate::{CodegenError, CodegenResult};
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use regalloc::{RealReg, Reg, RegClass, Set, Writable};
+use smallvec::{smallvec, SmallVec};
+use std::convert::TryFrom;
+
+// We use a generic implementation that factors out ABI commonalities.
+
+/// Support for the S390x ABI from the callee side (within a function body).
+pub type S390xABICallee = ABICalleeImpl<S390xMachineDeps>;
+
+/// Support for the S390x ABI from the caller side (at a callsite).
+pub type S390xABICaller = ABICallerImpl<S390xMachineDeps>;
+
+// ABI register usage.
+
+/// Returns `true` for the types that `compute_arg_locs` places in
+/// general-purpose registers: the integer types `I8`..`I64`, the reference
+/// type `R64`, and the boolean types `B1`..`B64`.
+fn in_int_reg(ty: Type) -> bool {
+    matches!(
+        ty,
+        types::I8
+            | types::I16
+            | types::I32
+            | types::I64
+            | types::R64
+            | types::B1
+            | types::B8
+            | types::B16
+            | types::B32
+            | types::B64
+    )
+}
+
+/// Returns `true` for the types that `compute_arg_locs` places in
+/// floating-point registers: `F32` and `F64`.
+fn in_flt_reg(ty: Type) -> bool {
+    matches!(ty, types::F32 | types::F64)
+}
+
+/// Maps the position of an integer argument to its register: positions 0
+/// through 4 use GPRs 2 through 6; any later position yields `None`.
+fn get_intreg_for_arg(idx: usize) -> Option<Reg> {
+    [2, 3, 4, 5, 6].get(idx).map(|&num| regs::gpr(num))
+}
+
+/// Maps the position of a floating-point argument to its register:
+/// positions 0 through 3 use FPRs 0, 2, 4 and 6; any later position yields
+/// `None`.
+fn get_fltreg_for_arg(idx: usize) -> Option<Reg> {
+    [0, 2, 4, 6].get(idx).map(|&num| regs::fpr(num))
+}
+
+/// Maps the position of an integer return value to its register.
+///
+/// Position 0 uses GPR 2. Positions 1 through 3 (GPRs 3 through 5) are an
+/// ABI extension to support multi-value returns. Later positions yield
+/// `None`.
+fn get_intreg_for_ret(idx: usize) -> Option<Reg> {
+    [2, 3, 4, 5].get(idx).map(|&num| regs::gpr(num))
+}
+
+/// Maps the position of a floating-point return value to its register.
+///
+/// Position 0 uses FPR 0. Positions 1 through 3 (FPRs 2, 4 and 6) are an
+/// ABI extension to support multi-value returns. Later positions yield
+/// `None`.
+fn get_fltreg_for_ret(idx: usize) -> Option<Reg> {
+    [0, 2, 4, 6].get(idx).map(|&num| regs::fpr(num))
+}
+
+/// This is the limit for the size of argument and return-value areas on the
+/// stack. We place a reasonable limit here to avoid integer overflow issues
+/// with 32-bit arithmetic: for now, 128 MB.
+///
+/// The value is expressed in bytes. `compute_arg_locs` compares the final
+/// stack-area size against it and fails with
+/// `CodegenError::ImplLimitExceeded` when the size is strictly larger.
+static STACK_ARG_RET_SIZE_LIMIT: u64 = 128 * 1024 * 1024;
+
+/// Converts an ABI-level stack addressing mode into an s390x memory operand.
+///
+/// This is implemented as `From<StackAMode> for MemArg` rather than as a
+/// direct `Into` impl: the standard library's blanket impl then provides
+/// `StackAMode: Into<MemArg>` automatically, so existing `mode.into()` call
+/// sites keep working while `MemArg::from(mode)` becomes available too.
+impl From<StackAMode> for MemArg {
+    fn from(mode: StackAMode) -> MemArg {
+        match mode {
+            // FP-relative requests are expressed relative to the initial SP.
+            StackAMode::FPOffset(off, _ty) => MemArg::InitialSPOffset { off },
+            StackAMode::NominalSPOffset(off, _ty) => MemArg::NominalSPOffset { off },
+            // SP-relative requests address memory directly off the stack
+            // pointer register.
+            StackAMode::SPOffset(off, _ty) => {
+                MemArg::reg_plus_off(stack_reg(), off, MemFlags::trusted())
+            }
+        }
+    }
+}
+
+/// S390x-specific ABI behavior. This struct just serves as an implementation
+/// point for the trait; it is never actually instantiated.
+pub struct S390xMachineDeps;
+
+impl ABIMachineSpec for S390xMachineDeps {
+    type I = Inst;
+
+    /// Returns the machine word size in bits, which is 64 here.
+    fn word_bits() -> u32 {
+        64
+    }
+
+    /// Return required stack alignment in bytes.
+    ///
+    /// The calling convention is ignored: the result is always 8.
+    fn stack_align(_call_conv: isa::CallConv) -> u32 {
+        8
+    }
+
+    /// Assigns a register or stack location to every parameter (or return
+    /// value) in `params`.
+    ///
+    /// Returns the list of locations, the total size in bytes of the stack
+    /// area used, and (when `add_ret_area_ptr` is set) the index of the
+    /// synthetic return-area pointer argument appended to the list.
+    ///
+    /// Panics if a parameter has an argument purpose other than `VMContext`,
+    /// `Normal`, `StackLimit` or `SignatureId`. Fails with
+    /// `CodegenError::ImplLimitExceeded` when the stack area exceeds
+    /// `STACK_ARG_RET_SIZE_LIMIT`.
+    fn compute_arg_locs(
+        call_conv: isa::CallConv,
+        _flags: &settings::Flags,
+        params: &[ir::AbiParam],
+        args_or_rets: ArgsOrRets,
+        add_ret_area_ptr: bool,
+    ) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
+        let is_ret = args_or_rets == ArgsOrRets::Rets;
+        // The Wasmtime conventions only change how return values are laid out.
+        let wasmtime_rets = call_conv.extends_wasmtime() && is_ret;
+
+        let mut gprs_used = 0;
+        let mut fprs_used = 0;
+        // Stack arguments start at offset 160 (past the register save area);
+        // return values start at offset 0.
+        let mut stack_off: u64 = if args_or_rets == ArgsOrRets::Args {
+            160
+        } else {
+            0
+        };
+        let mut locs = Vec::new();
+
+        for (i, param) in params.iter().enumerate() {
+            // Validate "purpose".
+            match param.purpose {
+                ir::ArgumentPurpose::VMContext
+                | ir::ArgumentPurpose::Normal
+                | ir::ArgumentPurpose::StackLimit
+                | ir::ArgumentPurpose::SignatureId => {}
+                _ => panic!(
+                    "Unsupported argument purpose {:?} in signature: {:?}",
+                    param.purpose, params
+                ),
+            }
+
+            let ty = param.value_type;
+            let is_int = in_int_reg(ty);
+            let is_flt = in_flt_reg(ty);
+            debug_assert!(is_int || is_flt);
+            debug_assert!(!is_int || !is_flt);
+
+            // Look up the next free register of the matching class, if any.
+            let reg = match (is_int, is_ret) {
+                (true, false) => get_intreg_for_arg(gprs_used),
+                (true, true) => get_intreg_for_ret(gprs_used),
+                (false, false) => get_fltreg_for_arg(fprs_used),
+                (false, true) => get_fltreg_for_ret(fprs_used),
+            };
+
+            // In the Wasmtime ABI only the first return value can be in a register.
+            let reg = reg.filter(|_| !(wasmtime_rets && i > 0));
+
+            match reg {
+                Some(reg) => {
+                    locs.push(ABIArg::reg(
+                        reg.to_real_reg(),
+                        ty,
+                        param.extension,
+                        param.purpose,
+                    ));
+                    if is_int {
+                        gprs_used += 1;
+                    } else {
+                        fprs_used += 1;
+                    }
+                }
+                None => {
+                    // Compute size. Every argument or return value takes a slot of
+                    // at least 8 bytes, except for return values in the Wasmtime ABI.
+                    let size = (ty_bits(ty) / 8) as u64;
+                    let slot_size = if wasmtime_rets { size } else { size.max(8) };
+
+                    // Align the stack slot.
+                    debug_assert!(slot_size.is_power_of_two());
+                    stack_off = align_to(stack_off, slot_size);
+
+                    // If the type is actually of smaller size (and the argument
+                    // was not extended), it is passed right-aligned.
+                    let unextended = param.extension == ir::ArgumentExtension::None;
+                    let padding = if size < slot_size && unextended {
+                        slot_size - size
+                    } else {
+                        0
+                    };
+                    locs.push(ABIArg::stack(
+                        (stack_off + padding) as i64,
+                        ty,
+                        param.extension,
+                        param.purpose,
+                    ));
+                    stack_off += slot_size;
+                }
+            }
+        }
+
+        stack_off = align_to(stack_off, 8);
+
+        // Optionally append the pointer to the return-value area as one more
+        // integer argument, in a register if one is left, else on the stack.
+        let extra_arg = if add_ret_area_ptr {
+            debug_assert!(args_or_rets == ArgsOrRets::Args);
+            match get_intreg_for_arg(gprs_used) {
+                Some(reg) => locs.push(ABIArg::reg(
+                    reg.to_real_reg(),
+                    types::I64,
+                    ir::ArgumentExtension::None,
+                    ir::ArgumentPurpose::Normal,
+                )),
+                None => {
+                    locs.push(ABIArg::stack(
+                        stack_off as i64,
+                        types::I64,
+                        ir::ArgumentExtension::None,
+                        ir::ArgumentPurpose::Normal,
+                    ));
+                    stack_off += 8;
+                }
+            }
+            Some(locs.len() - 1)
+        } else {
+            None
+        };
+
+        // To avoid overflow issues, limit the arg/return size to something
+        // reasonable -- here, 128 MB.
+        if stack_off > STACK_ARG_RET_SIZE_LIMIT {
+            return Err(CodegenError::ImplLimitExceeded);
+        }
+
+        Ok((locs, stack_off as i64, extra_arg))
+    }
+
+    /// Always returns 0, whatever the calling convention or flags.
+    // NOTE(review): presumably 0 because FP-relative addresses are mapped to
+    // `MemArg::InitialSPOffset` on this target — confirm against abi_impl.
+    fn fp_to_arg_offset(_call_conv: isa::CallConv, _flags: &settings::Flags) -> i64 {
+        0
+    }
+
+    /// Builds a load of a `ty` value from the stack location `mem` into
+    /// `into_reg`; `mem` is first converted into a `MemArg`.
+    fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Inst {
+        Inst::gen_load(into_reg, mem.into(), ty)
+    }
+
+    /// Builds a store of the `ty` value in `from_reg` to the stack location
+    /// `mem`; `mem` is first converted into a `MemArg`.
+    fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Inst {
+        Inst::gen_store(mem.into(), from_reg, ty)
+    }
+
+    /// Builds a register-to-register move of a `ty` value by delegating to
+    /// `Inst::gen_move`.
+    fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Inst {
+        Inst::gen_move(to_reg, from_reg, ty)
+    }
+
+    /// Builds an `Inst::Extend` that widens the low `from_bits` bits of
+    /// `from_reg` to `to_bits` bits in `to_reg`, sign-extending when `signed`
+    /// is set.
+    ///
+    /// Panics unless `from_bits` is strictly smaller than `to_bits`.
+    fn gen_extend(
+        to_reg: Writable<Reg>,
+        from_reg: Reg,
+        signed: bool,
+        from_bits: u8,
+        to_bits: u8,
+    ) -> Inst {
+        // An extension must actually widen the value.
+        assert!(from_bits < to_bits);
+        Inst::Extend {
+            rd: to_reg,
+            rn: from_reg,
+            signed,
+            from_bits,
+            to_bits,
+        }
+    }
+
+    /// Builds the return instruction, using GPR 14 as the link register.
+    fn gen_ret() -> Inst {
+        Inst::Ret { link: gpr(14) }
+    }
+
+    /// Emits instructions computing `into_reg = from_reg + imm`.
+    ///
+    /// When `imm` fits an unsigned 12-bit or a signed 20-bit displacement, a
+    /// single `LoadAddr` with `from_reg` as base is used. Otherwise the value
+    /// is moved into `into_reg` (unless it is already there) and `imm` is
+    /// added with a 32-bit unsigned-immediate add.
+    fn gen_add_imm(into_reg: Writable<Reg>, from_reg: Reg, imm: u32) -> SmallInstVec<Inst> {
+        // Try the address-generation forms first, shortest displacement first.
+        let addr_form = if let Some(disp) = UImm12::maybe_from_u64(imm as u64) {
+            Some(MemArg::BXD12 {
+                base: from_reg,
+                index: zero_reg(),
+                disp,
+                flags: MemFlags::trusted(),
+            })
+        } else if let Some(disp) = SImm20::maybe_from_i64(imm as i64) {
+            Some(MemArg::BXD20 {
+                base: from_reg,
+                index: zero_reg(),
+                disp,
+                flags: MemFlags::trusted(),
+            })
+        } else {
+            None
+        };
+
+        let mut insts = SmallVec::new();
+        match addr_form {
+            Some(mem) => insts.push(Inst::LoadAddr { rd: into_reg, mem }),
+            None => {
+                // The add operates in place on `into_reg`, so copy first if needed.
+                if from_reg != into_reg.to_reg() {
+                    insts.push(Inst::mov64(into_reg, from_reg));
+                }
+                insts.push(Inst::AluRUImm32 {
+                    alu_op: ALUOp::Add64,
+                    rd: into_reg,
+                    imm,
+                });
+            }
+        }
+        insts
+    }
+
+    /// Emits a single compare-and-trap: an unsigned 64-bit comparison of the
+    /// stack pointer against `limit_reg` that traps with
+    /// `TrapCode::StackOverflow` when the stack pointer is less than or equal
+    /// to the limit.
+    fn gen_stack_lower_bound_trap(limit_reg: Reg) -> SmallInstVec<Inst> {
+        smallvec![Inst::CmpTrapRR {
+            op: CmpOp::CmpL64,
+            rn: stack_reg(),
+            rm: limit_reg,
+            cond: Cond::from_intcc(IntCC::UnsignedLessThanOrEqual),
+            trap_code: ir::TrapCode::StackOverflow,
+        }]
+    }
+
+    /// Returns the `Inst::EpiloguePlaceholder` pseudo-instruction.
+    fn gen_epilogue_placeholder() -> Inst {
+        Inst::EpiloguePlaceholder
+    }
+
+    /// Builds a `LoadAddr` that materializes the address of the stack
+    /// location `mem` in `into_reg`. The type argument is unused.
+    fn gen_get_stack_addr(mem: StackAMode, into_reg: Writable<Reg>, _ty: Type) -> Inst {
+        Inst::LoadAddr {
+            rd: into_reg,
+            mem: mem.into(),
+        }
+    }
+
+    /// Returns the register used to hold the stack limit: the spill
+    /// temporary register.
+    fn get_stacklimit_reg() -> Reg {
+        spilltmp_reg()
+    }
+
+    /// Builds a load of a `ty` value from address `base + offset` into
+    /// `into_reg`, using trusted memory flags.
+    fn gen_load_base_offset(into_reg: Writable<Reg>, base: Reg, offset: i32, ty: Type) -> Inst {
+        Inst::gen_load(
+            into_reg,
+            MemArg::reg_plus_off(base, i64::from(offset), MemFlags::trusted()),
+            ty,
+        )
+    }
+
+    /// Builds a store of the `ty` value in `from_reg` to address
+    /// `base + offset`, using trusted memory flags.
+    fn gen_store_base_offset(base: Reg, offset: i32, from_reg: Reg, ty: Type) -> Inst {
+        Inst::gen_store(
+            MemArg::reg_plus_off(base, i64::from(offset), MemFlags::trusted()),
+            from_reg,
+            ty,
+        )
+    }
+
+    /// Emits the instruction that adds `imm` to the stack pointer.
+    ///
+    /// Nothing is emitted for a zero adjustment. Otherwise a single 64-bit
+    /// add is produced, using the 16-bit signed-immediate form when `imm`
+    /// fits in an `i16` and the 32-bit form when it does not.
+    fn gen_sp_reg_adjust(imm: i32) -> SmallInstVec<Inst> {
+        let mut insts = SmallVec::new();
+        if imm != 0 {
+            let rd = writable_stack_reg();
+            let alu_op = ALUOp::Add64;
+            insts.push(match i16::try_from(imm) {
+                Ok(imm) => Inst::AluRSImm16 { alu_op, rd, imm },
+                Err(_) => Inst::AluRSImm32 { alu_op, rd, imm },
+            });
+        }
+        insts
+    }
+
+    /// Builds the `VirtualSPOffsetAdj` pseudo-instruction carrying `offset`
+    /// (widened via `.into()`).
+    fn gen_nominal_sp_adj(offset: i32) -> Inst {
+        Inst::VirtualSPOffsetAdj {
+            offset: offset.into(),
+        }
+    }
+
+    /// Emits no instructions. As the module documentation notes, s390x does
+    /// not use a frame pointer register, so there is no frame to set up here.
+    fn gen_prologue_frame_setup(_flags: &settings::Flags) -> SmallInstVec<Inst> {
+        SmallVec::new()
+    }
+
+    /// Emits no instructions; this mirrors `gen_prologue_frame_setup`, which
+    /// sets nothing up.
+    fn gen_epilogue_frame_restore(_flags: &settings::Flags) -> SmallInstVec<Inst> {
+        SmallVec::new()
+    }
+
+    /// Stack probes are not implemented for this target: the argument is
+    /// ignored and no instructions are emitted.
+    fn gen_probestack(_: u32) -> SmallInstVec<Self::I> {
+        // TODO: implement if we ever require stack probes on an s390x host
+        // (unlikely unless Lucet is ported)
+        smallvec![]
+    }
+
+    // Returns stack bytes used as well as instructions. Does not adjust
+    // nominal SP offset; abi_impl generic code will do that.
+    //
+    // NOTE(review): the body below does emit a nominal-SP adjustment for the
+    // outgoing-argument area, so the sentence above presumably refers only to
+    // the clobber / fixed-frame portion — confirm against abi_impl.
+    //
+    // Emitted sequence, in order:
+    //   1. (unwind only) define the new frame;
+    //   2. one store-multiple saving the clobbered GPRs, issued before the
+    //      stack pointer is changed;
+    //   3. (unwind only) one `SaveReg` record per saved GPR;
+    //   4. a single stack-pointer decrement covering outgoing args, the FPR
+    //      clobber area and the fixed frame storage;
+    //   5. the nominal-SP adjustment for the outgoing-argument area;
+    //   6. one 64-bit store per clobbered FPR.
+    //
+    // The returned byte count is only the FPR clobber area (8 bytes per FPR),
+    // even though the stack pointer moves by the whole frame size.
+    fn gen_clobber_save(
+        call_conv: isa::CallConv,
+        flags: &settings::Flags,
+        clobbers: &Set<Writable<RealReg>>,
+        fixed_frame_storage_size: u32,
+        outgoing_args_size: u32,
+    ) -> (u64, SmallVec<[Inst; 16]>) {
+        let mut insts = SmallVec::new();
+
+        // Collect clobbered registers.
+        let (clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers);
+        // Lowest hardware encoding among the clobbered GPRs; 16 is the
+        // "no GPR needs saving" sentinel.
+        let mut first_clobbered_gpr = 16;
+        for reg in clobbered_gpr {
+            let enc = reg.to_reg().get_hw_encoding();
+            if enc < first_clobbered_gpr {
+                first_clobbered_gpr = enc;
+            }
+        }
+        // Each saved FPR takes one 8-byte slot.
+        let clobber_size = clobbered_fpr.len() * 8;
+        if flags.unwind_info() {
+            insts.push(Inst::Unwind {
+                inst: UnwindInst::DefineNewFrame {
+                    offset_upward_to_caller_sp: 160,
+                    offset_downward_to_clobbers: clobber_size as u32,
+                },
+            });
+        }
+
+        // Use STMG to save clobbered GPRs into save area.
+        // Everything from the first clobbered GPR up to GPR 15 is stored by
+        // one instruction, at offset 8 * (register number) from the stack
+        // pointer, which has not been decremented yet.
+        if first_clobbered_gpr < 16 {
+            let offset = 8 * first_clobbered_gpr as i64;
+            insts.push(Inst::StoreMultiple64 {
+                rt: gpr(first_clobbered_gpr as u8),
+                rt2: gpr(15),
+                addr_reg: stack_reg(),
+                addr_off: SImm20::maybe_from_i64(offset).unwrap(),
+            });
+        }
+        if flags.unwind_info() {
+            // Record every GPR covered by the store-multiple above; the range
+            // is empty when no GPR was clobbered.
+            for i in first_clobbered_gpr..16 {
+                insts.push(Inst::Unwind {
+                    inst: UnwindInst::SaveReg {
+                        clobber_offset: clobber_size as u32 + (i * 8) as u32,
+                        reg: gpr(i as u8).to_real_reg(),
+                    },
+                });
+            }
+        }
+
+        // Decrement stack pointer.
+        // One adjustment allocates the outgoing-argument area, the FPR
+        // clobber area and the fixed frame storage together.
+        let stack_size =
+            outgoing_args_size as i32 + clobber_size as i32 + fixed_frame_storage_size as i32;
+        insts.extend(Self::gen_sp_reg_adjust(-stack_size));
+        if flags.unwind_info() {
+            insts.push(Inst::Unwind {
+                inst: UnwindInst::StackAlloc {
+                    size: stack_size as u32,
+                },
+            });
+        }
+
+        // Tell later passes that nominal SP sits above the outgoing-argument
+        // area (skipped when that area is empty).
+        let sp_adj = outgoing_args_size as i32;
+        if sp_adj > 0 {
+            insts.push(Self::gen_nominal_sp_adj(sp_adj));
+        }
+
+        // Save FPRs.
+        // Slot i lives above the outgoing-argument area and the fixed frame
+        // storage, measured from the new stack pointer.
+        for (i, reg) in clobbered_fpr.iter().enumerate() {
+            insts.push(Inst::FpuStore64 {
+                rd: reg.to_reg().to_reg(),
+                mem: MemArg::reg_plus_off(
+                    stack_reg(),
+                    (i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64,
+                    MemFlags::trusted(),
+                ),
+            });
+            if flags.unwind_info() {
+                insts.push(Inst::Unwind {
+                    inst: UnwindInst::SaveReg {
+                        clobber_offset: (i * 8) as u32,
+                        reg: reg.to_reg(),
+                    },
+                });
+            }
+        }
+
+        (clobber_size as u64, insts)
+    }
+
+    /// Generate the instructions that restore the clobbered callee-saved
+    /// registers and release the stack frame.
+    ///
+    /// FPRs are reloaded first, from 8-byte slots located above the
+    /// outgoing-argument and fixed-frame areas. The GPRs from the
+    /// lowest-numbered clobbered one through `%r15` are then reloaded with a
+    /// single `LoadMultiple64`. Because that range includes the stack
+    /// pointer, the explicit stack-pointer increment is skipped whenever the
+    /// frame size can be folded into the load's displacement.
+    fn gen_clobber_restore(
+        call_conv: isa::CallConv,
+        _: &settings::Flags,
+        clobbers: &Set<Writable<RealReg>>,
+        fixed_frame_storage_size: u32,
+        outgoing_args_size: u32,
+    ) -> SmallVec<[Inst; 16]> {
+        let mut insts = SmallVec::new();
+
+        // Collect clobbered registers.
+        let (clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers);
+        // Lowest hardware encoding among the clobbered GPRs; 16 means "none".
+        let mut first_clobbered_gpr = 16;
+        for reg in clobbered_gpr {
+            let enc = reg.to_reg().get_hw_encoding();
+            if enc < first_clobbered_gpr {
+                first_clobbered_gpr = enc;
+            }
+        }
+        // Each saved FPR occupies one 8-byte slot.
+        let clobber_size = clobbered_fpr.len() * 8;
+
+        // Restore FPRs.
+        for (i, reg) in clobbered_fpr.iter().enumerate() {
+            insts.push(Inst::FpuLoad64 {
+                rd: Writable::from_reg(reg.to_reg().to_reg()),
+                mem: MemArg::reg_plus_off(
+                    stack_reg(),
+                    (i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64,
+                    MemFlags::trusted(),
+                ),
+            });
+        }
+
+        // Increment stack pointer unless it will be restored implicitly.
+        // The restore is implicit when a GPR load follows (it reloads %r15)
+        // and the save-area offset plus the frame size still fits a signed
+        // 20-bit displacement.
+        let stack_size =
+            outgoing_args_size as i32 + clobber_size as i32 + fixed_frame_storage_size as i32;
+        let implicit_sp_restore = first_clobbered_gpr < 16
+            && SImm20::maybe_from_i64(8 * first_clobbered_gpr as i64 + stack_size as i64).is_some();
+        if !implicit_sp_restore {
+            insts.extend(Self::gen_sp_reg_adjust(stack_size));
+        }
+
+        // Use LMG to restore clobbered GPRs from save area.
+        if first_clobbered_gpr < 16 {
+            let mut offset = 8 * first_clobbered_gpr as i64;
+            if implicit_sp_restore {
+                // SP was not adjusted above, so skip over the frame here.
+                offset += stack_size as i64;
+            }
+            insts.push(Inst::LoadMultiple64 {
+                rt: writable_gpr(first_clobbered_gpr as u8),
+                rt2: writable_gpr(15),
+                addr_reg: stack_reg(),
+                addr_off: SImm20::maybe_from_i64(offset).unwrap(),
+            });
+        }
+
+        insts
+    }
+
+    /// Build the instruction sequence that calls `dest`.
+    ///
+    /// A `Near` external name is called directly with `Inst::Call`. A `Far`
+    /// external name is first loaded into `tmp` and then called indirectly.
+    /// A register destination is called indirectly through that register.
+    /// Only the call instruction itself is marked as a safepoint.
+    fn gen_call(
+        dest: &CallDest,
+        uses: Vec<Reg>,
+        defs: Vec<Writable<Reg>>,
+        opcode: ir::Opcode,
+        tmp: Writable<Reg>,
+        _callee_conv: isa::CallConv,
+        _caller_conv: isa::CallConv,
+    ) -> SmallVec<[(InstIsSafepoint, Inst); 2]> {
+        // Every call form links through %r14.
+        let link = writable_gpr(14);
+        let mut seq = SmallVec::new();
+
+        match dest {
+            CallDest::ExtName(name, RelocDistance::Near) => {
+                let info = Box::new(CallInfo {
+                    dest: name.clone(),
+                    uses,
+                    defs,
+                    opcode,
+                });
+                seq.push((InstIsSafepoint::Yes, Inst::Call { link, info }));
+            }
+            CallDest::ExtName(name, RelocDistance::Far) => {
+                let load_target = Inst::LoadExtNameFar {
+                    rd: tmp,
+                    name: Box::new(name.clone()),
+                    offset: 0,
+                };
+                seq.push((InstIsSafepoint::No, load_target));
+
+                let info = Box::new(CallIndInfo {
+                    rn: tmp.to_reg(),
+                    uses,
+                    defs,
+                    opcode,
+                });
+                seq.push((InstIsSafepoint::Yes, Inst::CallInd { link, info }));
+            }
+            CallDest::Reg(reg) => {
+                let info = Box::new(CallIndInfo {
+                    rn: *reg,
+                    uses,
+                    defs,
+                    opcode,
+                });
+                seq.push((InstIsSafepoint::Yes, Inst::CallInd { link, info }));
+            }
+        }
+
+        seq
+    }
+
+    /// Not implemented for this backend: always panics via `unimplemented!`,
+    /// whose message states that StructArgs are not supported on S390X yet.
+    fn gen_memcpy(
+        _call_conv: isa::CallConv,
+        _dst: Reg,
+        _src: Reg,
+        _size: usize,
+    ) -> SmallVec<[Self::I; 8]> {
+        unimplemented!("StructArgs not implemented for S390X yet");
+    }
+
+    /// Number of spill slots needed for a value of register class `rc`.
+    ///
+    /// Slots are 8 bytes wide, so a value in either supported class takes
+    /// exactly one slot whatever its `ty`. Panics for any other class.
+    fn get_number_of_spillslots_for_value(rc: RegClass, ty: Type) -> u32 {
+        match (rc, ty) {
+            (RegClass::I64, _) | (RegClass::F64, _) => 1,
+            _ => panic!("Unexpected register class!"),
+        }
+    }
+
+    /// Get the current virtual-SP offset from an instruction-emission state.
+    ///
+    /// Returns the state's `virtual_sp_offset` field unchanged.
+    fn get_virtual_sp_offset_from_state(s: &EmitState) -> i64 {
+        s.virtual_sp_offset
+    }
+
+    /// Get the nominal-SP-to-FP offset from an instruction-emission state.
+    ///
+    /// Returns the state's `initial_sp_offset` field unchanged.
+    fn get_nominal_sp_to_fp(s: &EmitState) -> i64 {
+        s.initial_sp_offset
+    }
+
+    /// List the registers a call to a callee using `call_conv_of_callee` may
+    /// clobber.
+    ///
+    /// GPRs with encodings 0 through 14 are considered first, then FPRs with
+    /// encodings 0 through 14; each is kept when `is_reg_clobbered_by_call`
+    /// accepts it, so the result lists GPRs before FPRs in encoding order.
+    fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> Vec<Writable<Reg>> {
+        let is_clobbered = |candidate: &Writable<Reg>| {
+            is_reg_clobbered_by_call(call_conv_of_callee, candidate.to_reg().to_real_reg())
+        };
+
+        (0..15)
+            .map(writable_gpr)
+            .chain((0..15).map(writable_fpr))
+            .filter(is_clobbered)
+            .collect()
+    }
+
+    /// Choose the argument extension mode to apply.
+    ///
+    /// The calling convention is ignored and `specified` is returned as given.
+    fn get_ext_mode(
+        _call_conv: isa::CallConv,
+        specified: ir::ArgumentExtension,
+    ) -> ir::ArgumentExtension {
+        specified
+    }
+}
+
+/// Whether `r` is a callee-saved register, i.e. one the prologue must save
+/// when the function clobbers it. The calling convention is ignored.
+///
+/// Panics for register classes other than `I64` and `F64`.
+fn is_reg_saved_in_prologue(_call_conv: isa::CallConv, r: RealReg) -> bool {
+    // Lowest callee-saved encoding per class; both ranges end at 15.
+    let first_callee_saved = match r.get_class() {
+        // r6 - r15 inclusive are callee-saves.
+        RegClass::I64 => 6,
+        // f8 - f15 inclusive are callee-saves.
+        RegClass::F64 => 8,
+        _ => panic!("Unexpected RegClass"),
+    };
+    let enc = r.get_hw_encoding();
+    first_callee_saved <= enc && enc <= 15
+}
+
+/// Split the members of `regs` that the prologue must save into
+/// `(integer registers, floating-point registers)`.
+///
+/// Registers rejected by `is_reg_saved_in_prologue` are dropped. Each
+/// returned list is sorted by register index.
+fn get_regs_saved_in_prologue(
+    call_conv: isa::CallConv,
+    regs: &Set<Writable<RealReg>>,
+) -> (Vec<Writable<RealReg>>, Vec<Writable<RealReg>>) {
+    let mut gprs = Vec::new();
+    let mut fprs = Vec::new();
+
+    let saved = regs
+        .iter()
+        .copied()
+        .filter(|candidate| is_reg_saved_in_prologue(call_conv, candidate.to_reg()));
+    for reg in saved {
+        let bucket = match reg.to_reg().get_class() {
+            RegClass::I64 => &mut gprs,
+            RegClass::F64 => &mut fprs,
+            _ => panic!("Unexpected RegClass"),
+        };
+        bucket.push(reg);
+    }
+
+    // Sort registers for deterministic code output.
+    for list in [&mut gprs, &mut fprs].iter_mut() {
+        list.sort_by_key(|r| r.to_reg().get_index());
+    }
+    (gprs, fprs)
+}
+
+/// Whether `r` is a caller-saved register, i.e. one a call may overwrite.
+/// The calling convention is ignored.
+///
+/// Panics for register classes other than `I64` and `F64`.
+fn is_reg_clobbered_by_call(_call_conv: isa::CallConv, r: RealReg) -> bool {
+    // Highest caller-saved encoding per class; both ranges start at 0.
+    let last_caller_saved = match r.get_class() {
+        // r0 - r5 inclusive are caller-saves.
+        RegClass::I64 => 5,
+        // f0 - f7 inclusive are caller-saves.
+        RegClass::F64 => 7,
+        _ => panic!("Unexpected RegClass"),
+    };
+    r.get_hw_encoding() <= last_caller_saved
+}
--- a/cranelift/codegen/src/isa/s390x/inst/args.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/args.rs
@@ -0,0 +1,317 @@
+//! S390x ISA definitions: instruction arguments.
+
+// Some variants are never constructed, but we still want them as options in the future.
+#![allow(dead_code)]
+
+use crate::ir::condcodes::{FloatCC, IntCC};
+use crate::ir::MemFlags;
+use crate::isa::s390x::inst::*;
+use crate::machinst::MachLabel;
+
+use regalloc::{PrettyPrint, RealRegUniverse, Reg};
+
+use std::string::String;
+
+//=============================================================================
+// Instruction sub-components (memory addresses): definitions
+
+/// A memory argument to load/store, encapsulating the possible addressing modes.
+///
+/// The first group of variants are real addressing modes. The second group
+/// are virtual modes that must be lowered before printing; the
+/// `PrettyPrint` implementation panics on them.
+#[derive(Clone, Debug)]
+pub enum MemArg {
+    //
+    // Real IBM Z addressing modes:
+    //
+    /// Base register, index register, and 12-bit unsigned displacement.
+    BXD12 {
+        base: Reg,
+        index: Reg,
+        disp: UImm12,
+        flags: MemFlags,
+    },
+
+    /// Base register, index register, and 20-bit signed displacement.
+    BXD20 {
+        base: Reg,
+        index: Reg,
+        disp: SImm20,
+        flags: MemFlags,
+    },
+
+    /// PC-relative Reference to a label.
+    Label { target: BranchTarget },
+
+    /// PC-relative Reference to a near symbol.
+    Symbol {
+        name: Box<ExternalName>,
+        offset: i32,
+        flags: MemFlags,
+    },
+
+    //
+    // Virtual addressing modes that are lowered at emission time:
+    //
+    /// Arbitrary offset from a register. Converted to generation of large
+    /// offsets with multiple instructions as necessary during code emission.
+    RegOffset { reg: Reg, off: i64, flags: MemFlags },
+
+    /// Offset from the stack pointer at function entry.
+    InitialSPOffset { off: i64 },
+
+    /// Offset from the "nominal stack pointer", which is where the real SP is
+    /// just after stack and spill slots are allocated in the function prologue.
+    /// At emission time, this is converted to `SPOffset` with a fixup added to
+    /// the offset constant. The fixup is a running value that is tracked as
+    /// emission iterates through instructions in linear order, and can be
+    /// adjusted up and down with [Inst::VirtualSPOffsetAdj].
+    ///
+    /// NOTE(review): this enum has no `SPOffset` variant; confirm which mode
+    /// the emitter actually lowers this to.
+    ///
+    /// The standard ABI is in charge of handling this (by emitting the
+    /// adjustment meta-instructions). It maintains the invariant that "nominal
+    /// SP" is where the actual SP is after the function prologue and before
+    /// clobber pushes. See the diagram in the documentation for
+    /// [the ABI module](crate::isa::s390x::abi) for more details.
+    NominalSPOffset { off: i64 },
+}
+
+impl MemArg {
+    /// Memory reference using an address in a register.
+    ///
+    /// Built as a `BXD12` with `zero_reg()` as the index and a zero
+    /// displacement.
+    pub fn reg(reg: Reg, flags: MemFlags) -> MemArg {
+        MemArg::reg_plus_reg(reg, zero_reg(), flags)
+    }
+
+    /// Memory reference using the sum of two registers as an address.
+    ///
+    /// `reg1` becomes the base and `reg2` the index of a `BXD12` with a zero
+    /// displacement.
+    pub fn reg_plus_reg(reg1: Reg, reg2: Reg, flags: MemFlags) -> MemArg {
+        let disp = UImm12::zero();
+        MemArg::BXD12 {
+            base: reg1,
+            index: reg2,
+            disp,
+            flags,
+        }
+    }
+
+    /// Memory reference using the sum of a register and an offset as address.
+    ///
+    /// Produces the virtual `RegOffset` mode, so `off` is not range-checked
+    /// here.
+    pub fn reg_plus_off(reg: Reg, off: i64, flags: MemFlags) -> MemArg {
+        MemArg::RegOffset { reg, off, flags }
+    }
+
+    /// Memory flags of this argument. Modes that carry no flags of their own
+    /// (`Label` and the two SP-relative modes) report `MemFlags::trusted()`.
+    pub(crate) fn get_flags(&self) -> MemFlags {
+        match self {
+            MemArg::BXD12 { flags, .. }
+            | MemArg::BXD20 { flags, .. }
+            | MemArg::RegOffset { flags, .. }
+            | MemArg::Symbol { flags, .. } => *flags,
+            MemArg::Label { .. }
+            | MemArg::InitialSPOffset { .. }
+            | MemArg::NominalSPOffset { .. } => MemFlags::trusted(),
+        }
+    }
+
+    /// Whether an access through this argument may trap, i.e. whether its
+    /// flags lack `notrap`.
+    pub(crate) fn can_trap(&self) -> bool {
+        let flags = self.get_flags();
+        !flags.notrap()
+    }
+}
+
+//=============================================================================
+// Instruction sub-components (conditions, branches and branch targets):
+// definitions
+
+/// Condition for conditional branches.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Cond {
+    /// Condition mask; `bits()` returns it as the machine encoding and
+    /// `invert()` complements its low four bits.
+    mask: u8,
+}
+
+impl Cond {
+    // Single-bit masks, named after the mnemonic suffix each one prints as.
+    const E: u8 = 8;
+    const L: u8 = 4;
+    const H: u8 = 2;
+    const O: u8 = 1;
+
+    /// Build a condition from a raw mask. Panics unless `1 <= mask <= 14`.
+    pub fn from_mask(mask: u8) -> Cond {
+        assert!(mask >= 1 && mask <= 14);
+        Cond { mask }
+    }
+
+    /// Build the condition for an integer comparison code. Signed and
+    /// unsigned variants of the same relation map to the same mask.
+    pub fn from_intcc(cc: IntCC) -> Cond {
+        let mask = match cc {
+            IntCC::Equal => Self::E,
+            IntCC::NotEqual => Self::L | Self::H,
+            IntCC::SignedGreaterThanOrEqual | IntCC::UnsignedGreaterThanOrEqual => {
+                Self::E | Self::H
+            }
+            IntCC::SignedGreaterThan | IntCC::UnsignedGreaterThan => Self::H,
+            IntCC::SignedLessThanOrEqual | IntCC::UnsignedLessThanOrEqual => Self::E | Self::L,
+            IntCC::SignedLessThan | IntCC::UnsignedLessThan => Self::L,
+            IntCC::Overflow => Self::O,
+            IntCC::NotOverflow => Self::E | Self::L | Self::H,
+        };
+        Cond { mask }
+    }
+
+    /// Build the condition for a floating-point comparison code. The
+    /// unordered outcome is carried by the lowest mask bit.
+    pub fn from_floatcc(cc: FloatCC) -> Cond {
+        let mask = match cc {
+            FloatCC::Ordered => Self::E | Self::L | Self::H,
+            FloatCC::Unordered => Self::O,
+            FloatCC::Equal => Self::E,
+            FloatCC::NotEqual => Self::L | Self::H | Self::O,
+            FloatCC::OrderedNotEqual => Self::L | Self::H,
+            FloatCC::UnorderedOrEqual => Self::E | Self::O,
+            FloatCC::LessThan => Self::L,
+            FloatCC::LessThanOrEqual => Self::E | Self::L,
+            FloatCC::GreaterThan => Self::H,
+            FloatCC::GreaterThanOrEqual => Self::E | Self::H,
+            FloatCC::UnorderedOrLessThan => Self::L | Self::O,
+            FloatCC::UnorderedOrLessThanOrEqual => Self::E | Self::L | Self::O,
+            FloatCC::UnorderedOrGreaterThan => Self::H | Self::O,
+            FloatCC::UnorderedOrGreaterThanOrEqual => Self::E | Self::H | Self::O,
+        };
+        Cond { mask }
+    }
+
+    /// Return the inverted condition: the low four mask bits complemented.
+    pub fn invert(self) -> Cond {
+        let complemented = !self.mask;
+        Cond {
+            mask: complemented & 15,
+        }
+    }
+
+    /// Return the machine encoding of this condition.
+    pub fn bits(self) -> u8 {
+        self.mask
+    }
+}
+
+/// A branch target. Either unresolved (basic-block index) or resolved (offset
+/// from end of current instruction).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum BranchTarget {
+    /// An unresolved reference to a Label, as passed into
+    /// `lower_branch_group()`.
+    Label(MachLabel),
+    /// A fixed PC offset. The `as_*_offset_or_zero` accessors shift it right
+    /// by one bit before returning it.
+    ResolvedOffset(i32),
+}
+
+impl BranchTarget {
+    /// Return the target's label, if it is a label-based target.
+    pub fn as_label(self) -> Option<MachLabel> {
+        if let BranchTarget::Label(label) = self {
+            Some(label)
+        } else {
+            None
+        }
+    }
+
+    /// Return the resolved offset shifted right by one bit as a 16-bit
+    /// field, or zero if the target is label-based.
+    ///
+    /// Panics if the shifted offset does not fit in a signed 16-bit value.
+    pub fn as_ri_offset_or_zero(self) -> u16 {
+        let off = if let BranchTarget::ResolvedOffset(byte_off) = self {
+            byte_off >> 1
+        } else {
+            0
+        };
+        assert!(off <= 0x7fff);
+        assert!(off >= -0x8000);
+        off as u16
+    }
+
+    /// Return the resolved offset shifted right by one bit as a 32-bit
+    /// field, or zero if the target is label-based.
+    pub fn as_ril_offset_or_zero(self) -> u32 {
+        if let BranchTarget::ResolvedOffset(byte_off) = self {
+            (byte_off >> 1) as u32
+        } else {
+            0
+        }
+    }
+}
+
+/// Format a base/index/displacement address. A register equal to
+/// `zero_reg()` counts as absent and is left out of the output.
+fn show_base_index_disp(
+    base: Reg,
+    index: Reg,
+    disp: String,
+    mb_rru: Option<&RealRegUniverse>,
+) -> String {
+    let has_base = base != zero_reg();
+    let has_index = index != zero_reg();
+    match (has_base, has_index) {
+        (true, true) => format!(
+            "{}({},{})",
+            disp,
+            index.show_rru(mb_rru),
+            base.show_rru(mb_rru)
+        ),
+        (true, false) => format!("{}({})", disp, base.show_rru(mb_rru)),
+        (false, true) => format!("{}({},)", disp, index.show_rru(mb_rru)),
+        (false, false) => disp,
+    }
+}
+
+impl PrettyPrint for MemArg {
+    /// Render a real addressing mode. Panics on the virtual modes, which
+    /// must have been eliminated beforehand.
+    fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String {
+        match self {
+            MemArg::BXD12 {
+                base, index, disp, ..
+            } => show_base_index_disp(*base, *index, disp.show_rru(mb_rru), mb_rru),
+            MemArg::BXD20 {
+                base, index, disp, ..
+            } => show_base_index_disp(*base, *index, disp.show_rru(mb_rru), mb_rru),
+            MemArg::Label { target } => target.show_rru(mb_rru),
+            MemArg::Symbol { name, offset, .. } => format!("{} + {}", name, offset),
+            // Eliminated by `mem_finalize()`.
+            MemArg::InitialSPOffset { .. }
+            | MemArg::NominalSPOffset { .. }
+            | MemArg::RegOffset { .. } => {
+                panic!("Unexpected pseudo mem-arg mode (stack-offset or generic reg-offset)!")
+            }
+        }
+    }
+}
+
+impl PrettyPrint for Cond {
+    /// Render the condition as its mnemonic suffix. Masks outside 1..=14
+    /// have no suffix and hit `unreachable!`.
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        // Suffix for mask `m` is stored at position `m - 1`.
+        const SUFFIXES: [&str; 14] = [
+            "o", "h", "nle", "l", "nhe", "lh", "ne", "e", "nlh", "he", "nl", "le", "nh", "no",
+        ];
+        match self.mask {
+            m @ 1..=14 => SUFFIXES[usize::from(m - 1)].to_string(),
+            _ => unreachable!(),
+        }
+    }
+}
+
+impl PrettyPrint for BranchTarget {
+    /// Render a label as `label<N>` and a resolved offset as its decimal
+    /// value.
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        match *self {
+            BranchTarget::Label(label) => format!("label{:?}", label.get()),
+            BranchTarget::ResolvedOffset(off) => off.to_string(),
+        }
+    }
+}
--- a/cranelift/codegen/src/isa/s390x/inst/emit.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs
--- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs
--- a/cranelift/codegen/src/isa/s390x/inst/imms.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/imms.rs
@@ -0,0 +1,231 @@
+//! S390x ISA definitions: immediate constants.
+
+use regalloc::{PrettyPrint, RealRegUniverse};
+use std::string::String;
+
+/// An unsigned 12-bit immediate.
+#[derive(Clone, Copy, Debug)]
+pub struct UImm12 {
+    /// The value.
+    value: u16,
+}
+
+impl UImm12 {
+    /// Construct a UImm12 from `value`, or return `None` when it is 4096 or
+    /// larger and so does not fit in 12 bits.
+    pub fn maybe_from_u64(value: u64) -> Option<UImm12> {
+        if value >= 4096 {
+            return None;
+        }
+        Some(UImm12 {
+            value: value as u16,
+        })
+    }
+
+    /// Create a zero immediate of this format.
+    pub fn zero() -> UImm12 {
+        UImm12 { value: 0 }
+    }
+
+    /// Bits for encoding.
+    pub fn bits(&self) -> u32 {
+        self.value.into()
+    }
+}
+
+/// A signed 20-bit immediate.
+#[derive(Clone, Copy, Debug)]
+pub struct SImm20 {
+    /// The value.
+    value: i32,
+}
+
+impl SImm20 {
+    /// Construct an SImm20 from `value`, or return `None` when it lies
+    /// outside the signed 20-bit range.
+    pub fn maybe_from_i64(value: i64) -> Option<SImm20> {
+        const MIN: i64 = -(1 << 19);
+        const MAX: i64 = (1 << 19) - 1;
+
+        if value < MIN || value > MAX {
+            return None;
+        }
+        Some(SImm20 {
+            value: value as i32,
+        })
+    }
+
+    /// Widen an unsigned 12-bit immediate; every such value fits.
+    pub fn from_uimm12(value: UImm12) -> SImm20 {
+        let widened = value.bits() as i32;
+        SImm20 { value: widened }
+    }
+
+    /// Create a zero immediate of this format.
+    pub fn zero() -> SImm20 {
+        SImm20 { value: 0 }
+    }
+
+    /// Bits for encoding: the two's-complement value truncated to its low
+    /// 20 bits.
+    pub fn bits(&self) -> u32 {
+        (self.value as u32) & 0xfffff
+    }
+}
+
+/// A 16-bit immediate with a {0,16,32,48}-bit shift.
+#[derive(Clone, Copy, Debug)]
+pub struct UImm16Shifted {
+    /// The value.
+    pub bits: u16,
+    /// Result is `bits` shifted 16*shift bits to the left.
+    pub shift: u8,
+}
+
+impl UImm16Shifted {
+    /// Construct a UImm16Shifted from an arbitrary 64-bit constant if possible.
+    ///
+    /// Succeeds when every set bit of `value` lies within one aligned 16-bit
+    /// lane; the lowest such lane is chosen, so zero yields shift 0.
+    pub fn maybe_from_u64(value: u64) -> Option<UImm16Shifted> {
+        (0u8..4).find_map(|shift| {
+            let lane_start = 16 * u32::from(shift);
+            let lane_mask = 0xffffu64 << lane_start;
+            if value == (value & lane_mask) {
+                Some(UImm16Shifted {
+                    bits: (value >> lane_start) as u16,
+                    shift,
+                })
+            } else {
+                None
+            }
+        })
+    }
+
+    /// Construct a UImm16Shifted from `imm` and a shift amount given in bits.
+    ///
+    /// `shift` is divided by 16 to obtain the stored shift field; `None` is
+    /// returned when that field would exceed 3.
+    pub fn maybe_with_shift(imm: u16, shift: u8) -> Option<UImm16Shifted> {
+        match shift / 16 {
+            shift_enc @ 0..=3 => Some(UImm16Shifted {
+                bits: imm,
+                shift: shift_enc,
+            }),
+            _ => None,
+        }
+    }
+
+    /// Return a copy with `bits` complemented and the shift unchanged.
+    pub fn negate_bits(&self) -> UImm16Shifted {
+        UImm16Shifted {
+            bits: !self.bits,
+            ..*self
+        }
+    }
+
+    /// Returns the value that this constant represents.
+    pub fn value(&self) -> u64 {
+        u64::from(self.bits) << (16 * self.shift)
+    }
+}
+
+/// A 32-bit immediate with a {0,32}-bit shift.
+#[derive(Clone, Copy, Debug)]
+pub struct UImm32Shifted {
+    /// The value.
+    pub bits: u32,
+    /// Result is `bits` shifted 32*shift bits to the left.
+    pub shift: u8,
+}
+
+impl UImm32Shifted {
+    /// Construct a UImm32Shifted from an arbitrary 64-bit constant if possible.
+    ///
+    /// Succeeds when every set bit of `value` lies within the low or the
+    /// high 32-bit half; the low half is preferred, so zero yields shift 0.
+    pub fn maybe_from_u64(value: u64) -> Option<UImm32Shifted> {
+        let mask0 = 0x0000_0000_ffff_ffffu64;
+        let mask1 = 0xffff_ffff_0000_0000u64;
+
+        if value == (value & mask0) {
+            return Some(UImm32Shifted {
+                bits: (value & mask0) as u32,
+                shift: 0,
+            });
+        }
+        if value == (value & mask1) {
+            return Some(UImm32Shifted {
+                bits: ((value >> 32) & mask0) as u32,
+                shift: 1,
+            });
+        }
+        None
+    }
+
+    /// Construct a UImm32Shifted from `imm` and a shift amount given in bits.
+    ///
+    /// `shift` is divided by 32 to obtain the stored shift field. Returns
+    /// `None` unless that field is 0 or 1: a 64-bit value has only two
+    /// 32-bit halves, and `value()` would otherwise shift by 64 bits or more.
+    pub fn maybe_with_shift(imm: u32, shift: u8) -> Option<UImm32Shifted> {
+        let shift_enc = shift / 32;
+        if shift_enc > 1 {
+            None
+        } else {
+            Some(UImm32Shifted {
+                bits: imm,
+                shift: shift_enc,
+            })
+        }
+    }
+
+    /// Convert a 16-bit shifted immediate to the equivalent 32-bit one.
+    ///
+    /// An odd 16-bit shift selects the upper half of a 32-bit lane, so the
+    /// bits are moved up by 16 within that lane.
+    pub fn from_uimm16shifted(value: UImm16Shifted) -> UImm32Shifted {
+        if value.shift % 2 == 0 {
+            UImm32Shifted {
+                bits: value.bits as u32,
+                shift: value.shift / 2,
+            }
+        } else {
+            UImm32Shifted {
+                bits: (value.bits as u32) << 16,
+                shift: value.shift / 2,
+            }
+        }
+    }
+
+    /// Return a copy with `bits` complemented and the shift unchanged.
+    pub fn negate_bits(&self) -> UImm32Shifted {
+        UImm32Shifted {
+            bits: !self.bits,
+            shift: self.shift,
+        }
+    }
+
+    /// Returns the value that this constant represents.
+    pub fn value(&self) -> u64 {
+        (self.bits as u64) << (32 * self.shift)
+    }
+}
+
+impl PrettyPrint for UImm12 {
+    /// Render the immediate as its decimal value.
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        self.value.to_string()
+    }
+}
+
+impl PrettyPrint for SImm20 {
+    /// Render the immediate as its signed decimal value.
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        self.value.to_string()
+    }
+}
+
+impl PrettyPrint for UImm16Shifted {
+    /// Render the unshifted `bits` in decimal; the shift is not printed.
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        self.bits.to_string()
+    }
+}
+
+impl PrettyPrint for UImm32Shifted {
+    /// Render the unshifted `bits` in decimal; the shift is not printed.
+    fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
+        self.bits.to_string()
+    }
+}
--- a/cranelift/codegen/src/isa/s390x/inst/mod.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs
--- a/cranelift/codegen/src/isa/s390x/inst/regs.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/regs.rs
@@ -0,0 +1,168 @@
+//! S390x ISA definitions: registers.
+
+use crate::settings;
+use regalloc::{RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES};
+
+//=============================================================================
+// Registers, the Universe thereof, and printing
+
+/// Register-universe index of each GPR, indexed by hardware encoding.
+///
+/// `gpr()` passes these to `Reg::new_real`, and `create_reg_universe`
+/// asserts that each one equals the register's position in the universe.
+#[rustfmt::skip]
+const GPR_INDICES: [u8; 16] = [
+    // r0 and r1 reserved
+    30, 31,
+    // r2 - r5 call-clobbered
+    16, 17, 18, 19,
+    // r6 - r14 call-saved (order reversed)
+    28, 27, 26, 25, 24, 23, 22, 21, 20,
+    // r15 (SP)
+    29,
+];
+
+/// Register-universe index of each FPR, indexed by hardware encoding.
+///
+/// `fpr()` passes these to `Reg::new_real`, and `create_reg_universe`
+/// asserts that each one equals the register's position in the universe.
+#[rustfmt::skip]
+const FPR_INDICES: [u8; 16] = [
+    // f0 - f7 as pairs
+    0, 4, 1, 5, 2, 6, 3, 7,
+    // f8 - f15 as pairs
+    8, 12, 9, 13, 10, 14, 11, 15,
+];
+
+/// Get a reference to a GPR (integer register).
+///
+/// `num` is the hardware encoding; panics unless it is below 16.
+pub fn gpr(num: u8) -> Reg {
+    assert!(num < 16);
+    let universe_index = GPR_INDICES[usize::from(num)];
+    Reg::new_real(RegClass::I64, num, universe_index)
+}
+
+/// Get a writable reference to a GPR.
+///
+/// Wraps `gpr(num)`, so it panics under the same condition.
+pub fn writable_gpr(num: u8) -> Writable<Reg> {
+    Writable::from_reg(gpr(num))
+}
+
+/// Get a reference to a FPR (floating-point register).
+///
+/// `num` is the hardware encoding; panics unless it is below 16.
+pub fn fpr(num: u8) -> Reg {
+    assert!(num < 16);
+    let universe_index = FPR_INDICES[usize::from(num)];
+    Reg::new_real(RegClass::F64, num, universe_index)
+}
+
+/// Get a writable reference to a FPR.
+///
+/// Wraps `fpr(num)`, so it panics under the same condition.
+pub fn writable_fpr(num: u8) -> Writable<Reg> {
+    Writable::from_reg(fpr(num))
+}
+
+/// Get a reference to the stack-pointer register, which is `%r15`.
+pub fn stack_reg() -> Reg {
+    gpr(15)
+}
+
+/// Get a writable reference to the stack-pointer register (see `stack_reg`).
+pub fn writable_stack_reg() -> Writable<Reg> {
+    Writable::from_reg(stack_reg())
+}
+
+/// Get a reference to the first temporary, sometimes "spill temporary", register. This register is
+/// used to compute the address of a spill slot when a direct offset addressing mode from FP is not
+/// sufficient (+/- 2^11 words). We exclude this register from regalloc and reserve it for this
+/// purpose for simplicity; otherwise we need a multi-stage analysis where we first determine how
+/// many spill slots we have, then perhaps remove the reg from the pool and recompute regalloc.
+///
+/// NOTE(review): the "+/- 2^11 words" range and the reference to FP look carried over from
+/// another backend; the displacement types in this backend are 12-bit unsigned and 20-bit
+/// signed. Confirm and reword.
+///
+/// We use r1 for this because it's a scratch register but is slightly special (used for linker
+/// veneers). We're free to use it as long as we don't expect it to live through call instructions.
+pub fn spilltmp_reg() -> Reg {
+    gpr(1)
+}
+
+/// Get a writable reference to the spilltmp reg (`%r1`, see `spilltmp_reg`).
+pub fn writable_spilltmp_reg() -> Writable<Reg> {
+    Writable::from_reg(spilltmp_reg())
+}
+
+/// Get a reference to `%r0`.
+///
+/// Memory arguments use this register in a base or index position to mean
+/// "no register": `MemArg::reg` passes it as the index, and the `MemArg`
+/// pretty-printer omits any base or index equal to it.
+pub fn zero_reg() -> Reg {
+    gpr(0)
+}
+
+/// Create the register universe for s390x.
+///
+/// FPRs come first, then the allocatable GPRs, then the GPRs withheld from
+/// the allocator (`%r15`, `%r0`, `%r1`). Panics if a register's recorded
+/// index differs from its position in the resulting list.
+pub fn create_reg_universe(_flags: &settings::Flags) -> RealRegUniverse {
+    // Hardware encodings, listed in universe order.
+    const FPR_ORDER: [u8; 16] = [0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15];
+    // Caller-saved r2 - r5 in the SystemV s390x ABI, then the callee-saved
+    // GPRs from r14 downwards in an attempt to allow the prolog to use as
+    // short a STMG as possible.
+    const ALLOCATABLE_GPR_ORDER: [u8; 13] = [2, 3, 4, 5, 14, 13, 12, 11, 10, 9, 8, 7, 6];
+    // Excluded from allocation:
+    // r15 (stack pointer)
+    // r0 (we cannot use this for addressing // FIXME regalloc)
+    // r1 (spilltmp)
+    const RESERVED_GPR_ORDER: [u8; 3] = [15, 0, 1];
+
+    let mut regs = vec![];
+    let mut allocable_by_class = [None; NUM_REG_CLASSES];
+
+    // FPRs.
+    let fpr_first = regs.len();
+    for &enc in FPR_ORDER.iter() {
+        regs.push((fpr(enc).to_real_reg(), format!("%f{}", enc)));
+    }
+    allocable_by_class[RegClass::F64.rc_to_usize()] = Some(RegClassInfo {
+        first: fpr_first,
+        last: regs.len() - 1,
+        suggested_scratch: Some(fpr(1).get_index()),
+    });
+
+    // Allocatable GPRs.
+    let gpr_first = regs.len();
+    for &enc in ALLOCATABLE_GPR_ORDER.iter() {
+        regs.push((gpr(enc).to_real_reg(), format!("%r{}", enc)));
+    }
+    allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo {
+        first: gpr_first,
+        last: regs.len() - 1,
+        suggested_scratch: Some(gpr(13).get_index()),
+    });
+
+    // Other regs, not available to the allocator.
+    let allocable = regs.len();
+    for &enc in RESERVED_GPR_ORDER.iter() {
+        regs.push((gpr(enc).to_real_reg(), format!("%r{}", enc)));
+    }
+
+    // Assert sanity: the indices in the register structs must match their
+    // actual indices in the array.
+    for (i, reg) in regs.iter().enumerate() {
+        assert_eq!(i, reg.0.get_index());
+    }
+
+    RealRegUniverse {
+        regs,
+        allocable,
+        allocable_by_class,
+    }
+}
--- a/cranelift/codegen/src/isa/s390x/inst/unwind.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/unwind.rs
@@ -0,0 +1,2 @@
+#[cfg(feature = "unwind")]
+pub(crate) mod systemv;
--- a/cranelift/codegen/src/isa/s390x/inst/unwind/systemv.rs
+++ b/cranelift/codegen/src/isa/s390x/inst/unwind/systemv.rs
@@ -0,0 +1,197 @@
+//! Unwind information for System V ABI (s390x).
+
+use crate::isa::unwind::systemv::RegisterMappingError;
+use gimli::{write::CommonInformationEntry, Encoding, Format, Register};
+use regalloc::{Reg, RegClass};
+
+/// Creates a new s390x common information entry (CIE).
+pub fn create_cie() -> CommonInformationEntry {
+    use gimli::write::CallFrameInstruction;
+
+    let mut entry = CommonInformationEntry::new(
+        Encoding {
+            address_size: 8,
+            format: Format::Dwarf32,
+            version: 1,
+        },
+        1,            // Code alignment factor
+        -8,           // Data alignment factor
+        Register(14), // Return address column - register %r14
+    );
+
+    // Every frame will start with the call frame address (CFA) at %r15 + 160.
+    entry.add_instruction(CallFrameInstruction::Cfa(Register(15), 160));
+
+    entry
+}
+
+/// Map Cranelift registers to their corresponding Gimli registers.
+pub fn map_reg(reg: Reg) -> Result<Register, RegisterMappingError> {
+    const GPR_MAP: [gimli::Register; 16] = [
+        Register(0),
+        Register(1),
+        Register(2),
+        Register(3),
+        Register(4),
+        Register(5),
+        Register(6),
+        Register(7),
+        Register(8),
+        Register(9),
+        Register(10),
+        Register(11),
+        Register(12),
+        Register(13),
+        Register(14),
+        Register(15),
+    ];
+    const FPR_MAP: [gimli::Register; 16] = [
+        Register(16),
+        Register(20),
+        Register(17),
+        Register(21),
+        Register(18),
+        Register(22),
+        Register(19),
+        Register(23),
+        Register(24),
+        Register(28),
+        Register(25),
+        Register(29),
+        Register(26),
+        Register(30),
+        Register(27),
+        Register(31),
+    ];
+
+    match reg.get_class() {
+        RegClass::I64 => Ok(GPR_MAP[reg.get_hw_encoding() as usize]),
+        RegClass::F64 => Ok(FPR_MAP[reg.get_hw_encoding() as usize]),
+        _ => Err(RegisterMappingError::UnsupportedRegisterBank("class?")),
+    }
+}
+
+pub(crate) struct RegisterMapper;
+
+impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper {
+    fn map(&self, reg: Reg) -> Result<u16, RegisterMappingError> {
+        Ok(map_reg(reg)?.0)
+    }
+    fn sp(&self) -> u16 {
+        Register(15).0
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::cursor::{Cursor, FuncCursor};
+    use crate::ir::{
+        types, AbiParam, ExternalName, Function, InstBuilder, Signature, StackSlotData,
+        StackSlotKind,
+    };
+    use crate::isa::{lookup, CallConv};
+    use crate::settings::{builder, Flags};
+    use crate::Context;
+    use gimli::write::Address;
+    use std::str::FromStr;
+    use target_lexicon::triple;
+
+    #[test]
+    fn test_simple_func() {
+        let isa = lookup(triple!("s390x"))
+            .expect("expect s390x ISA")
+            .finish(Flags::new(builder()));
+
+        let mut context = Context::for_function(create_function(
+            CallConv::SystemV,
+            Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+        ));
+
+        context.compile(&*isa).expect("expected compilation");
+
+        let fde = match context
+            .create_unwind_info(isa.as_ref())
+            .expect("can create unwind info")
+        {
+            Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+                info.to_fde(Address::Constant(1234))
+            }
+            _ => panic!("expected unwind information"),
+        };
+
+        assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(1234), length: 10, lsda: None, instructions: [(4, CfaOffset(224))] }");
+    }
+
+    fn create_function(call_conv: CallConv, stack_slot: Option<StackSlotData>) -> Function {
+        let mut func =
+            Function::with_name_signature(ExternalName::user(0, 0), Signature::new(call_conv));
+
+        let block0 = func.dfg.make_block();
+        let mut pos = FuncCursor::new(&mut func);
+        pos.insert_block(block0);
+        pos.ins().return_(&[]);
+
+        if let Some(stack_slot) = stack_slot {
+            func.stack_slots.push(stack_slot);
+        }
+
+        func
+    }
+
+    #[test]
+    fn test_multi_return_func() {
+        let isa = lookup(triple!("s390x"))
+            .expect("expect s390x ISA")
+            .finish(Flags::new(builder()));
+
+        let mut context = Context::for_function(create_multi_return_function(
+            CallConv::SystemV,
+            Some(StackSlotData::new(StackSlotKind::ExplicitSlot, 64)),
+        ));
+
+        context.compile(&*isa).expect("expected compilation");
+
+        let fde = match context
+            .create_unwind_info(isa.as_ref())
+            .expect("can create unwind info")
+        {
+            Some(crate::isa::unwind::UnwindInfo::SystemV(info)) => {
+                info.to_fde(Address::Constant(4321))
+            }
+            _ => panic!("expected unwind information"),
+        };
+
+        assert_eq!(format!("{:?}", fde), "FrameDescriptionEntry { address: Constant(4321), length: 26, lsda: None, instructions: [(4, CfaOffset(224))] }");
+    }
+
+    fn create_multi_return_function(
+        call_conv: CallConv,
+        stack_slot: Option<StackSlotData>,
+    ) -> Function {
+        let mut sig = Signature::new(call_conv);
+        sig.params.push(AbiParam::new(types::I32));
+        let mut func = Function::with_name_signature(ExternalName::user(0, 0), sig);
+
+        let block0 = func.dfg.make_block();
+        let v0 = func.dfg.append_block_param(block0, types::I32);
+        let block1 = func.dfg.make_block();
+        let block2 = func.dfg.make_block();
+
+        let mut pos = FuncCursor::new(&mut func);
+        pos.insert_block(block0);
+        pos.ins().brnz(v0, block2, &[]);
+        pos.ins().jump(block1, &[]);
+
+        pos.insert_block(block1);
+        pos.ins().return_(&[]);
+
+        pos.insert_block(block2);
+        pos.ins().return_(&[]);
+
+        if let Some(stack_slot) = stack_slot {
+            func.stack_slots.push(stack_slot);
+        }
+
+        func
+    }
+}
--- a/cranelift/codegen/src/isa/s390x/lower.rs
+++ b/cranelift/codegen/src/isa/s390x/lower.rs
--- a/cranelift/codegen/src/isa/s390x/mod.rs
+++ b/cranelift/codegen/src/isa/s390x/mod.rs
@@ -0,0 +1,296 @@
+//! IBM Z 64-bit Instruction Set Architecture.
+
+use crate::ir::condcodes::IntCC;
+use crate::ir::Function;
+use crate::isa::s390x::settings as s390x_settings;
+use crate::isa::unwind::systemv::RegisterMappingError;
+use crate::isa::Builder as IsaBuilder;
+use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
+use crate::result::CodegenResult;
+use crate::settings as shared_settings;
+
+use alloc::{boxed::Box, vec::Vec};
+use core::hash::{Hash, Hasher};
+
+use regalloc::{PrettyPrint, RealRegUniverse, Reg};
+use target_lexicon::{Architecture, Triple};
+
+// New backend:
+mod abi;
+pub(crate) mod inst;
+mod lower;
+mod settings;
+
+use inst::create_reg_universe;
+
+use self::inst::EmitInfo;
+
+/// An IBM Z backend.
+pub struct S390xBackend {
+    triple: Triple,
+    flags: shared_settings::Flags,
+    isa_flags: s390x_settings::Flags,
+    reg_universe: RealRegUniverse,
+}
+
+impl S390xBackend {
+    /// Create a new IBM Z backend with the given (shared) flags.
+    pub fn new_with_flags(
+        triple: Triple,
+        flags: shared_settings::Flags,
+        isa_flags: s390x_settings::Flags,
+    ) -> S390xBackend {
+        let reg_universe = create_reg_universe(&flags);
+        S390xBackend {
+            triple,
+            flags,
+            isa_flags,
+            reg_universe,
+        }
+    }
+
+    /// This performs lowering to VCode, register-allocates the code, computes block layout and
+    /// finalizes branches. The result is ready for binary emission.
+    fn compile_vcode(
+        &self,
+        func: &Function,
+        flags: shared_settings::Flags,
+    ) -> CodegenResult<VCode<inst::Inst>> {
+        let emit_info = EmitInfo::new(flags.clone());
+        let abi = Box::new(abi::S390xABICallee::new(func, flags)?);
+        compile::compile::<S390xBackend>(func, self, abi, emit_info)
+    }
+}
+
+impl MachBackend for S390xBackend {
+    fn compile_function(
+        &self,
+        func: &Function,
+        want_disasm: bool,
+    ) -> CodegenResult<MachCompileResult> {
+        let flags = self.flags();
+        let vcode = self.compile_vcode(func, flags.clone())?;
+        let buffer = vcode.emit();
+        let frame_size = vcode.frame_size();
+        let value_labels_ranges = vcode.value_labels_ranges();
+        let stackslot_offsets = vcode.stackslot_offsets().clone();
+
+        let disasm = if want_disasm {
+            Some(vcode.show_rru(Some(&create_reg_universe(flags))))
+        } else {
+            None
+        };
+
+        let buffer = buffer.finish();
+
+        Ok(MachCompileResult {
+            buffer,
+            frame_size,
+            disasm,
+            value_labels_ranges,
+            stackslot_offsets,
+        })
+    }
+
+    fn name(&self) -> &'static str {
+        "s390x"
+    }
+
+    fn triple(&self) -> Triple {
+        self.triple.clone()
+    }
+
+    fn flags(&self) -> &shared_settings::Flags {
+        &self.flags
+    }
+
+    fn isa_flags(&self) -> Vec<shared_settings::Value> {
+        self.isa_flags.iter().collect()
+    }
+
+    fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) {
+        self.flags.hash(&mut hasher);
+        self.isa_flags.hash(&mut hasher);
+    }
+
+    fn reg_universe(&self) -> &RealRegUniverse {
+        &self.reg_universe
+    }
+
+    fn unsigned_add_overflow_condition(&self) -> IntCC {
+        unimplemented!()
+    }
+
+    fn unsigned_sub_overflow_condition(&self) -> IntCC {
+        unimplemented!()
+    }
+
+    #[cfg(feature = "unwind")]
+    fn emit_unwind_info(
+        &self,
+        result: &MachCompileResult,
+        kind: crate::machinst::UnwindInfoKind,
+    ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
+        use crate::isa::unwind::UnwindInfo;
+        use crate::machinst::UnwindInfoKind;
+        Ok(match kind {
+            UnwindInfoKind::SystemV => {
+                let mapper = self::inst::unwind::systemv::RegisterMapper;
+                Some(UnwindInfo::SystemV(
+                    crate::isa::unwind::systemv::create_unwind_info_from_insts(
+                        &result.buffer.unwind_info[..],
+                        result.buffer.data.len(),
+                        &mapper,
+                    )?,
+                ))
+            }
+            _ => None,
+        })
+    }
+
+    #[cfg(feature = "unwind")]
+    fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
+        Some(inst::unwind::systemv::create_cie())
+    }
+
+    #[cfg(feature = "unwind")]
+    fn map_reg_to_dwarf(&self, reg: Reg) -> Result<u16, RegisterMappingError> {
+        inst::unwind::systemv::map_reg(reg).map(|reg| reg.0)
+    }
+}
+
+/// Create a new `isa::Builder`.
+pub fn isa_builder(triple: Triple) -> IsaBuilder {
+    assert!(triple.architecture == Architecture::S390x);
+    IsaBuilder {
+        triple,
+        setup: s390x_settings::builder(),
+        constructor: |triple, shared_flags, builder| {
+            let isa_flags = s390x_settings::Flags::new(&shared_flags, builder);
+            let backend = S390xBackend::new_with_flags(triple, shared_flags, isa_flags);
+            Box::new(TargetIsaAdapter::new(backend))
+        },
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::cursor::{Cursor, FuncCursor};
+    use crate::ir::types::*;
+    use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
+    use crate::isa::CallConv;
+    use crate::settings;
+    use crate::settings::Configurable;
+    use core::str::FromStr;
+    use target_lexicon::Triple;
+
+    #[test]
+    fn test_compile_function() {
+        let name = ExternalName::testcase("test0");
+        let mut sig = Signature::new(CallConv::SystemV);
+        sig.params.push(AbiParam::new(I32));
+        sig.returns.push(AbiParam::new(I32));
+        let mut func = Function::with_name_signature(name, sig);
+
+        let bb0 = func.dfg.make_block();
+        let arg0 = func.dfg.append_block_param(bb0, I32);
+
+        let mut pos = FuncCursor::new(&mut func);
+        pos.insert_block(bb0);
+        let v0 = pos.ins().iconst(I32, 0x1234);
+        let v1 = pos.ins().iadd(arg0, v0);
+        pos.ins().return_(&[v1]);
+
+        let mut shared_flags_builder = settings::builder();
+        shared_flags_builder.set("opt_level", "none").unwrap();
+        let shared_flags = settings::Flags::new(shared_flags_builder);
+        let isa_flags = s390x_settings::Flags::new(&shared_flags, s390x_settings::builder());
+        let backend = S390xBackend::new_with_flags(
+            Triple::from_str("s390x").unwrap(),
+            shared_flags,
+            isa_flags,
+        );
+        let result = backend
+            .compile_function(&mut func, /* want_disasm = */ false)
+            .unwrap();
+        let code = &result.buffer.data[..];
+
+        // ahi %r2, 0x1234
+        // br %r14
+        let golden = vec![0xa7, 0x2a, 0x12, 0x34, 0x07, 0xfe];
+
+        assert_eq!(code, &golden[..]);
+    }
+
+    #[test]
+    fn test_branch_lowering() {
+        let name = ExternalName::testcase("test0");
+        let mut sig = Signature::new(CallConv::SystemV);
+        sig.params.push(AbiParam::new(I32));
+        sig.returns.push(AbiParam::new(I32));
+        let mut func = Function::with_name_signature(name, sig);
+
+        let bb0 = func.dfg.make_block();
+        let arg0 = func.dfg.append_block_param(bb0, I32);
+        let bb1 = func.dfg.make_block();
+        let bb2 = func.dfg.make_block();
+        let bb3 = func.dfg.make_block();
+
+        let mut pos = FuncCursor::new(&mut func);
+        pos.insert_block(bb0);
+        let v0 = pos.ins().iconst(I32, 0x1234);
+        let v1 = pos.ins().iadd(arg0, v0);
+        pos.ins().brnz(v1, bb1, &[]);
+        pos.ins().jump(bb2, &[]);
+        pos.insert_block(bb1);
+        pos.ins().brnz(v1, bb2, &[]);
+        pos.ins().jump(bb3, &[]);
+        pos.insert_block(bb2);
+        let v2 = pos.ins().iadd(v1, v0);
+        pos.ins().brnz(v2, bb2, &[]);
+        pos.ins().jump(bb1, &[]);
+        pos.insert_block(bb3);
+        let v3 = pos.ins().isub(v1, v0);
+        pos.ins().return_(&[v3]);
+
+        let mut shared_flags_builder = settings::builder();
+        shared_flags_builder.set("opt_level", "none").unwrap();
+        let shared_flags = settings::Flags::new(shared_flags_builder);
+        let isa_flags = s390x_settings::Flags::new(&shared_flags, s390x_settings::builder());
+        let backend = S390xBackend::new_with_flags(
+            Triple::from_str("s390x").unwrap(),
+            shared_flags,
+            isa_flags,
+        );
+        let result = backend
+            .compile_function(&mut func, /* want_disasm = */ false)
+            .unwrap();
+        let code = &result.buffer.data[..];
+
+        // FIXME: the branching logic should be optimized more
+
+        // ahi %r2, 4660
+        // chi %r2, 0
+        // jglh label1 ; jg label2
+        // jg label6
+        // jg label3
+        // ahik %r3, %r2, 4660
+        // chi %r3, 0
+        // jglh label4 ; jg label5
+        // jg label3
+        // jg label6
+        // chi %r2, 0
+        // jglh label7 ; jg label8
+        // jg label3
+        // ahi %r2, -4660
+        // br %r14
+        let golden = vec![
+            167, 42, 18, 52, 167, 46, 0, 0, 192, 100, 0, 0, 0, 11, 236, 50, 18, 52, 0, 216, 167,
+            62, 0, 0, 192, 100, 255, 255, 255, 251, 167, 46, 0, 0, 192, 100, 255, 255, 255, 246,
+            167, 42, 237, 204, 7, 254,
+        ];
+
+        assert_eq!(code, &golden[..]);
+    }
+}
--- a/cranelift/codegen/src/isa/s390x/settings.rs
+++ b/cranelift/codegen/src/isa/s390x/settings.rs
@@ -0,0 +1,9 @@
+//! S390X Settings.
+
+use crate::settings::{self, detail, Builder, Value};
+use core::fmt;
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+// `cranelift-codegen/meta/src/isa/s390x/settings.rs`.
+include!(concat!(env!("OUT_DIR"), "/settings-s390x.rs"));
--- a/cranelift/codegen/src/isa/unwind.rs
+++ b/cranelift/codegen/src/isa/unwind.rs
@@ -225,6 +225,11 @@ pub enum UnwindInst {
        /// the clobber area.
        offset_downward_to_clobbers: u32,
    },
+    /// The stack pointer was adjusted to allocate the stack.
+    StackAlloc {
+        /// Size to allocate.
+        size: u32,
+    },
    /// The stack slot at the given offset from the clobber-area base has been
    /// used to save the given register.
    ///
--- a/cranelift/codegen/src/isa/unwind/systemv.rs
+++ b/cranelift/codegen/src/isa/unwind/systemv.rs
@@ -6,7 +6,6 @@ use crate::isa::unwind::UnwindInst;
 use crate::result::{CodegenError, CodegenResult};
 use alloc::vec::Vec;
 use gimli::write::{Address, FrameDescriptionEntry};
-use thiserror::Error;

 #[cfg(feature = "enable-serde")]
 use serde::{Deserialize, Serialize};
@@ -15,16 +14,32 @@ type Register = u16;

 /// Enumerate the errors possible in mapping Cranelift registers to their DWARF equivalent.
 #[allow(missing_docs)]
-#[derive(Error, Debug, PartialEq, Eq)]
+#[derive(Debug, PartialEq, Eq)]
 pub enum RegisterMappingError {
-    #[error("unable to find bank for register info")]
    MissingBank,
-    #[error("register mapping is currently only implemented for x86_64")]
    UnsupportedArchitecture,
-    #[error("unsupported register bank: {0}")]
    UnsupportedRegisterBank(&'static str),
 }

+// This is manually implementing Error and Display instead of using thiserror to reduce the amount
+// of dependencies used by Cranelift.
+impl std::error::Error for RegisterMappingError {}
+
+impl std::fmt::Display for RegisterMappingError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            RegisterMappingError::MissingBank => write!(f, "unable to find bank for register info"),
+            RegisterMappingError::UnsupportedArchitecture => write!(
+                f,
+                "register mapping is currently only implemented for x86_64"
+            ),
+            RegisterMappingError::UnsupportedRegisterBank(bank) => {
+                write!(f, "unsupported register bank: {}", bank)
+            }
+        }
+    }
+}
+
 // This mirrors gimli's CallFrameInstruction, but is serializable
 // This excludes CfaExpression, Expression, ValExpression due to
 // https://github.com/gimli-rs/gimli/issues/513.
@@ -122,8 +137,10 @@ pub(crate) trait RegisterMapper<Reg> {
    fn map(&self, reg: Reg) -> Result<Register, RegisterMappingError>;
    /// Gets stack pointer register.
    fn sp(&self) -> Register;
-    /// Gets the frame pointer register.
-    fn fp(&self) -> Register;
+    /// Gets the frame pointer register, if any.
+    fn fp(&self) -> Option<Register> {
+        None
+    }
    /// Gets the link register, if any.
    fn lr(&self) -> Option<Register> {
        None
@@ -151,6 +168,7 @@ pub(crate) fn create_unwind_info_from_insts<MR: RegisterMapper<regalloc::Reg>>(
 ) -> CodegenResult<UnwindInfo> {
    let mut instructions = vec![];

+    let mut cfa_offset = 0;
    let mut clobber_offset_to_cfa = 0;
    for &(instruction_offset, ref inst) in insts {
        match inst {
@@ -163,10 +181,14 @@ pub(crate) fn create_unwind_info_from_insts<MR: RegisterMapper<regalloc::Reg>>(
                    instruction_offset,
                    CallFrameInstruction::CfaOffset(offset_upward_to_caller_sp as i32),
                ));
-                // Note that we saved the old FP value on the stack.
+                // Note that we saved the old FP value on the stack.  Use of this
+                // operation implies that the target defines a FP register.
                instructions.push((
                    instruction_offset,
-                    CallFrameInstruction::Offset(mr.fp(), -(offset_upward_to_caller_sp as i32)),
+                    CallFrameInstruction::Offset(
+                        mr.fp().unwrap(),
+                        -(offset_upward_to_caller_sp as i32),
+                    ),
                ));
                // If there is a link register on this architecture, note that
                // we saved it as well.
@@ -188,15 +210,29 @@ pub(crate) fn create_unwind_info_from_insts<MR: RegisterMapper<regalloc::Reg>>(
                // Define CFA in terms of FP. Note that we assume it was already
                // defined correctly in terms of the current SP, and FP has just
                // been set to the current SP, so we do not need to change the
-                // offset, only the register.
-                instructions.push((
-                    instruction_offset,
-                    CallFrameInstruction::CfaRegister(mr.fp()),
-                ));
+                // offset, only the register.  (This is done only if the target
+                // defines a frame pointer register.)
+                if let Some(fp) = mr.fp() {
+                    instructions.push((instruction_offset, CallFrameInstruction::CfaRegister(fp)));
+                }
+                // Record initial CFA offset.  This will be used with later
+                // StackAlloc calls if we do not have a frame pointer.
+                cfa_offset = offset_upward_to_caller_sp;
                // Record distance from CFA downward to clobber area so we can
                // express clobber offsets later in terms of CFA.
                clobber_offset_to_cfa = offset_upward_to_caller_sp + offset_downward_to_clobbers;
            }
+            &UnwindInst::StackAlloc { size } => {
+                // If we do not use a frame pointer, we need to update the
+                // CFA offset whenever the stack pointer changes.
+                if mr.fp().is_none() {
+                    cfa_offset += size;
+                    instructions.push((
+                        instruction_offset,
+                        CallFrameInstruction::CfaOffset(cfa_offset as i32),
+                    ));
+                }
+            }
            &UnwindInst::SaveReg {
                clobber_offset,
                reg,
--- a/cranelift/codegen/src/isa/unwind/winx64.rs
+++ b/cranelift/codegen/src/isa/unwind/winx64.rs
@@ -3,14 +3,11 @@
 use crate::isa::unwind::input;
 use crate::result::{CodegenError, CodegenResult};
 use alloc::vec::Vec;
-use byteorder::{ByteOrder, LittleEndian};
 use log::warn;
 #[cfg(feature = "enable-serde")]
 use serde::{Deserialize, Serialize};

-#[cfg(feature = "x64")]
 use crate::binemit::CodeOffset;
-#[cfg(feature = "x64")]
 use crate::isa::unwind::UnwindInst;

 /// Maximum (inclusive) size of a "small" stack allocation
@@ -33,20 +30,20 @@ impl<'a> Writer<'a> {
        self.offset += 1;
    }

-    fn write_u16<T: ByteOrder>(&mut self, v: u16) {
-        T::write_u16(&mut self.buf[self.offset..(self.offset + 2)], v);
+    fn write_u16_le(&mut self, v: u16) {
+        self.buf[self.offset..(self.offset + 2)].copy_from_slice(&v.to_le_bytes());
        self.offset += 2;
    }

-    fn write_u32<T: ByteOrder>(&mut self, v: u32) {
-        T::write_u32(&mut self.buf[self.offset..(self.offset + 4)], v);
+    fn write_u32_le(&mut self, v: u32) {
+        self.buf[self.offset..(self.offset + 4)].copy_from_slice(&v.to_le_bytes());
        self.offset += 4;
    }
 }

 /// The supported unwind codes for the x64 Windows ABI.
 ///
-/// See: https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64
+/// See: <https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64>
 /// Only what is needed to describe the prologues generated by the Cranelift x86 ISA are represented here.
 /// Note: the Cranelift x86 ISA RU enum matches the Windows unwind GPR encoding values.
 #[allow(dead_code)]
@@ -123,11 +120,11 @@ impl UnwindCode {
                let scaled_stack_offset = stack_offset / 16;
                if scaled_stack_offset <= core::u16::MAX as u32 {
                    writer.write_u8((*reg << 4) | (op_small as u8));
-                    writer.write_u16::<LittleEndian>(scaled_stack_offset as u16);
+                    writer.write_u16_le(scaled_stack_offset as u16);
                } else {
                    writer.write_u8((*reg << 4) | (op_large as u8));
-                    writer.write_u16::<LittleEndian>(*stack_offset as u16);
-                    writer.write_u16::<LittleEndian>((stack_offset >> 16) as u16);
+                    writer.write_u16_le(*stack_offset as u16);
+                    writer.write_u16_le((stack_offset >> 16) as u16);
                }
            }
            Self::StackAlloc {
@@ -145,10 +142,10 @@ impl UnwindCode {
                    );
                } else if *size <= LARGE_ALLOC_16BIT_MAX_SIZE {
                    writer.write_u8(UnwindOperation::LargeStackAlloc as u8);
-                    writer.write_u16::<LittleEndian>((*size / 8) as u16);
+                    writer.write_u16_le((*size / 8) as u16);
                } else {
                    writer.write_u8((1 << 4) | (UnwindOperation::LargeStackAlloc as u8));
-                    writer.write_u32::<LittleEndian>(*size);
+                    writer.write_u32_le(*size);
                }
            }
            Self::SetFPReg { instruction_offset } => {
@@ -195,7 +192,7 @@ pub(crate) trait RegisterMapper<Reg> {
 /// Represents Windows x64 unwind information.
 ///
 /// For information about Windows x64 unwind info, see:
-/// https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64
+/// <https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64>
 #[derive(Clone, Debug, PartialEq, Eq)]
 #[cfg_attr(feature = "enable-serde", derive(Serialize, Deserialize))]
 pub struct UnwindInfo {
@@ -250,7 +247,7 @@ impl UnwindInfo {

        // To keep a 32-bit alignment, emit 2 bytes of padding if there's an odd number of 16-bit nodes
        if (node_count & 1) == 1 {
-            writer.write_u16::<LittleEndian>(0);
+            writer.write_u16_le(0);
        }

        // Ensure the correct number of bytes was emitted
@@ -334,10 +331,8 @@ impl UnwindInfo {
    }
 }

-#[cfg(feature = "x64")]
 const UNWIND_RBP_REG: u8 = 5;

-#[cfg(feature = "x64")]
 pub(crate) fn create_unwind_info_from_insts<MR: RegisterMapper<regalloc::Reg>>(
    insts: &[(CodeOffset, UnwindInst)],
 ) -> CodegenResult<UnwindInfo> {
@@ -360,6 +355,12 @@ pub(crate) fn create_unwind_info_from_insts<MR: RegisterMapper<regalloc::Reg>>(
                frame_register_offset = ensure_unwind_offset(offset_downward_to_clobbers)?;
                unwind_codes.push(UnwindCode::SetFPReg { instruction_offset });
            }
+            &UnwindInst::StackAlloc { size } => {
+                unwind_codes.push(UnwindCode::StackAlloc {
+                    instruction_offset,
+                    size,
+                });
+            }
            &UnwindInst::SaveReg {
                clobber_offset,
                reg,
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -237,10 +237,20 @@ impl ABIMachineSpec for X64ABIMachineSpec {
                        extension: param.extension,
                    });
                } else {
-                    // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte
-                    // stack alignment happens separately after all args.)
+                    // Compute size. For the wasmtime ABI it differs from native
+                    // ABIs in how multiple values are returned, so we take a
+                    // leaf out of arm64's book by not rounding everything up to
+                    // 8 bytes. For all ABI arguments, and other ABI returns,
+                    // though, each slot takes a minimum of 8 bytes.
+                    //
+                    // Note that in all cases 16-byte stack alignment happens
+                    // separately after all args.
                    let size = (reg_ty.bits() / 8) as u64;
-                    let size = std::cmp::max(size, 8);
+                    let size = if args_or_rets == ArgsOrRets::Rets && call_conv.extends_wasmtime() {
+                        size
+                    } else {
+                        std::cmp::max(size, 8)
+                    };
                    // Align.
                    debug_assert!(size.is_power_of_two());
                    next_stack = align_to(next_stack, size);
@@ -490,6 +500,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
        flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        fixed_frame_storage_size: u32,
+        _outgoing_args_size: u32,
    ) -> (u64, SmallVec<[Self::I; 16]>) {
        let mut insts = SmallVec::new();
        // Find all clobbered registers that are callee-save.
@@ -564,6 +575,7 @@ impl ABIMachineSpec for X64ABIMachineSpec {
        flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        fixed_frame_storage_size: u32,
+        _outgoing_args_size: u32,
    ) -> SmallVec<[Self::I; 16]> {
        let mut insts = SmallVec::new();

@@ -824,15 +836,7 @@ impl From<StackAMode> for SyntheticAmode {
 }

 fn get_intreg_for_arg(call_conv: &CallConv, idx: usize, arg_idx: usize) -> Option<Reg> {
-    let is_fastcall = match call_conv {
-        CallConv::Fast
-        | CallConv::Cold
-        | CallConv::SystemV
-        | CallConv::BaldrdashSystemV
-        | CallConv::Baldrdash2020 => false,
-        CallConv::WindowsFastcall => true,
-        _ => panic!("int args only supported for SysV or Fastcall calling convention"),
-    };
+    let is_fastcall = call_conv.extends_windows_fastcall();

    // Fastcall counts by absolute argument number; SysV counts by argument of
    // this (integer) class.
@@ -853,15 +857,7 @@ fn get_intreg_for_arg(call_conv: &CallConv, idx: usize, arg_idx: usize) -> Optio
 }

 fn get_fltreg_for_arg(call_conv: &CallConv, idx: usize, arg_idx: usize) -> Option<Reg> {
-    let is_fastcall = match call_conv {
-        CallConv::Fast
-        | CallConv::Cold
-        | CallConv::SystemV
-        | CallConv::BaldrdashSystemV
-        | CallConv::Baldrdash2020 => false,
-        CallConv::WindowsFastcall => true,
-        _ => panic!("float args only supported for SysV or Fastcall calling convention"),
-    };
+    let is_fastcall = call_conv.extends_windows_fastcall();

    // Fastcall counts by absolute argument number; SysV counts by argument of
    // this (floating-point) class.
@@ -894,7 +890,10 @@ fn get_intreg_for_retval(
            1 => Some(regs::rdx()),
            _ => None,
        },
-        CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => {
+        CallConv::BaldrdashSystemV
+        | CallConv::Baldrdash2020
+        | CallConv::WasmtimeSystemV
+        | CallConv::WasmtimeFastcall => {
            if intreg_idx == 0 && retval_idx == 0 {
                Some(regs::rax())
            } else {
@@ -907,6 +906,7 @@ fn get_intreg_for_retval(
            _ => None,
        },
        CallConv::BaldrdashWindows | CallConv::Probestack => todo!(),
+        CallConv::AppleAarch64 => unreachable!(),
    }
 }

@@ -921,7 +921,10 @@ fn get_fltreg_for_retval(
            1 => Some(regs::xmm1()),
            _ => None,
        },
-        CallConv::BaldrdashSystemV | CallConv::Baldrdash2020 => {
+        CallConv::BaldrdashSystemV
+        | CallConv::Baldrdash2020
+        | CallConv::WasmtimeFastcall
+        | CallConv::WasmtimeSystemV => {
            if fltreg_idx == 0 && retval_idx == 0 {
                Some(regs::xmm0())
            } else {
@@ -933,6 +936,7 @@ fn get_fltreg_for_retval(
            _ => None,
        },
        CallConv::BaldrdashWindows | CallConv::Probestack => todo!(),
+        CallConv::AppleAarch64 => unreachable!(),
    }
 }

@@ -990,17 +994,18 @@ fn get_callee_saves(call_conv: &CallConv, regs: &Set<Writable<RealReg>>) -> Vec<
        CallConv::BaldrdashWindows => {
            todo!("baldrdash windows");
        }
-        CallConv::Fast | CallConv::Cold | CallConv::SystemV => regs
+        CallConv::Fast | CallConv::Cold | CallConv::SystemV | CallConv::WasmtimeSystemV => regs
            .iter()
            .cloned()
            .filter(|r| is_callee_save_systemv(r.to_reg()))
            .collect(),
-        CallConv::WindowsFastcall => regs
+        CallConv::WindowsFastcall | CallConv::WasmtimeFastcall => regs
            .iter()
            .cloned()
            .filter(|r| is_callee_save_fastcall(r.to_reg()))
            .collect(),
        CallConv::Probestack => todo!("probestack?"),
+        CallConv::AppleAarch64 => unreachable!(),
    };
    // Sort registers for deterministic code output. We can do an unstable sort because the
    // registers will be unique (there are no dups).
--- a/cranelift/codegen/src/isa/x64/encoding/evex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/evex.rs
@@ -0,0 +1,403 @@
+//! Encodes EVEX instructions. These instructions are those added by the AVX-512 extensions. The
+//! EVEX encoding requires a 4-byte prefix:
+//!
+//! Byte 0:  0x62
+//!         ┌───┬───┬───┬───┬───┬───┬───┬───┐
+//! Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │
+//!         ├───┼───┼───┼───┼───┼───┼───┼───┤
+//! Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │
+//!         ├───┼───┼───┼───┼───┼───┼───┼───┤
+//! Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │
+//!         └───┴───┴───┴───┴───┴───┴───┴───┘
+//!
+//! The prefix is then followed by the opcode byte, the ModR/M byte, and other optional suffixes
+//! (e.g. SIB byte, displacements, immediates) based on the instruction (see section 2.6, Intel
+//! Software Development Manual, volume 2A).
+use super::rex::{encode_modrm, LegacyPrefixes, OpcodeMap};
+use super::ByteSink;
+use core::ops::RangeInclusive;
+
+/// Constructs an EVEX-encoded instruction using a builder pattern. This approach makes it visually
+/// easier to transform something in the manual's syntax, `EVEX.256.66.0F38.W1 1F /r`, to code:
+/// `EvexInstruction::new().length(...).prefix(...).map(...).w(true).opcode(0x1F).reg(...).rm(...)`.
+pub struct EvexInstruction {
+    bits: u32,
+    opcode: u8,
+    reg: Register,
+    rm: Register,
+}
+
+/// Because some of the bit flags in the EVEX prefix are reversed and users of `EvexInstruction` may
+/// choose to skip setting fields, here we set some sane defaults. Note that:
+/// - the first byte is always `0x62` but you will notice it at the end of the default `bits` value
+///   implemented--remember the little-endian order
+/// - some bits are always set to certain values: bits 10-11 to 0, bit 18 to 1
+/// - the other bits set correspond to reversed bits: R, X, B, R' (byte 1), vvvv (byte 2), V' (byte
+///   3).
+///
+/// See the `default_emission` test for what these defaults are equivalent to (e.g. using RAX,
+/// unsetting the W bit, etc.)
+impl Default for EvexInstruction {
+    fn default() -> Self {
+        Self {
+            bits: 0x08_7C_F0_62,
+            opcode: 0,
+            reg: Register::default(),
+            rm: Register::default(),
+        }
+    }
+}
+
+#[allow(non_upper_case_globals)] // This makes it easier to match the bit range names to the manual's names.
+impl EvexInstruction {
+    /// Construct a default EVEX instruction.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the vector length (the `L'` and `L` bits). Rounding and memory-broadcast instructions
+    /// reuse these bits together with `b`; at some point (TODO) we can add a way to set those
+    /// context bits and verify that both are not used (e.g. rounding AND length). Only the raw
+    /// length goes into `LL`: `EvexContext::bits()` also carries `b` and would land one bit high.
+    #[inline(always)]
+    pub fn length(mut self, length: EvexVectorLength) -> Self {
+        self.write(Self::LL, length.bits() as u32);
+        self
+    }
+
+    /// Set the legacy prefix byte of the instruction: None | 66 | F0 | F2 | F3. EVEX instructions
+    /// pack these into the prefix, not as separate bytes.
+    #[inline(always)]
+    pub fn prefix(mut self, prefix: LegacyPrefixes) -> Self {
+        self.write(Self::pp, prefix.bits() as u32);
+        self
+    }
+
+    /// Set the opcode map byte of the instruction: None | 0F | 0F38 | 0F3A. EVEX instructions pack
+    /// these into the prefix, not as separate bytes.
+    #[inline(always)]
+    pub fn map(mut self, map: OpcodeMap) -> Self {
+        self.write(Self::mm, map.bits() as u32);
+        self
+    }
+
+    /// Set the W bit, typically used to indicate an instruction using 64 bits of an operand (e.g.
+    /// 64 bit lanes). EVEX packs this bit in the EVEX prefix; previous encodings used the REX
+    /// prefix.
+    #[inline(always)]
+    pub fn w(mut self, w: bool) -> Self {
+        self.write(Self::W, w as u32);
+        self
+    }
+
+    /// Set the instruction opcode byte.
+    #[inline(always)]
+    pub fn opcode(mut self, opcode: u8) -> Self {
+        self.opcode = opcode;
+        self
+    }
+
+    /// Set the register to use for the `reg` bits; many instructions use this as the write operand.
+    /// Setting this affects both the ModRM byte (`reg` section) and the EVEX prefix (the extension
+    /// bits for register encodings >= 8).
+    #[inline(always)]
+    pub fn reg(mut self, reg: impl Into<Register>) -> Self {
+        self.reg = reg.into();
+        let r = !(self.reg.0 >> 3) & 1;
+        let r_ = !(self.reg.0 >> 4) & 1;
+        self.write(Self::R, r as u32);
+        self.write(Self::R_, r_ as u32);
+        self
+    }
+
+    /// Set the mask to use. See section 2.6 in the Intel Software Developer's Manual, volume 2A for
+    /// more details.
+    #[allow(dead_code)]
+    #[inline(always)]
+    pub fn mask(mut self, mask: EvexMasking) -> Self {
+        self.write(Self::aaa, mask.aaa_bits() as u32);
+        self.write(Self::z, mask.z_bit() as u32);
+        self
+    }
+
+    /// Set the `vvvvv` register; some instructions allow using this as a second, non-destructive
+    /// source register in 3-operand instructions (e.g. 2 read, 1 write).
+    #[allow(dead_code)]
+    #[inline(always)]
+    pub fn vvvvv(mut self, reg: impl Into<Register>) -> Self {
+        let reg = reg.into();
+        self.write(Self::vvvv, !(reg.0 as u32) & 0b1111);
+        self.write(Self::V_, !(reg.0 as u32 >> 4) & 0b1);
+        self
+    }
+
+    /// Set the register to use for the `rm` bits; many instructions use this as the "read from
+    /// register/memory" operand. Currently this does not support memory addressing (TODO). Setting
+    /// this affects both the ModRM byte (`rm` section) and the EVEX prefix (the extension bits for
+    /// register encodings >= 8).
+    #[inline(always)]
+    pub fn rm(mut self, reg: impl Into<Register>) -> Self {
+        self.rm = reg.into();
+        let b = !(self.rm.0 >> 3) & 1;
+        let x = !(self.rm.0 >> 4) & 1;
+        self.write(Self::X, x as u32);
+        self.write(Self::B, b as u32);
+        self
+    }
+
+    /// Emit the EVEX-encoded instruction to the code sink:
+    /// - first, the 4-byte EVEX prefix;
+    /// - then, the opcode byte;
+    /// - finally, the ModR/M byte.
+    ///
+    /// Eventually this method should support encodings of more than just the reg-reg addressing mode (TODO).
+    pub fn encode<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
+        sink.put4(self.bits);
+        sink.put1(self.opcode);
+        sink.put1(encode_modrm(3, self.reg.0 & 7, self.rm.0 & 7));
+    }
+
+    // In order to simplify the encoding of the various bit ranges in the prefix, we specify those
+    // ranges according to the table below (extracted from the Intel Software Development Manual,
+    // volume 2A). Remember that, because we pack the 4-byte prefix into a little-endian `u32`, this
+    // chart should be read from right-to-left, top-to-bottom. Note also that we start ranges at bit
+    // 8, leaving bits 0-7 for the mandatory `0x62`.
+    //         ┌───┬───┬───┬───┬───┬───┬───┬───┐
+    // Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │
+    //         ├───┼───┼───┼───┼───┼───┼───┼───┤
+    // Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │
+    //         ├───┼───┼───┼───┼───┼───┼───┼───┤
+    // Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │
+    //         └───┴───┴───┴───┴───┴───┴───┴───┘
+
+    // Byte 1:
+    const mm: RangeInclusive<u8> = 8..=9;
+    const R_: RangeInclusive<u8> = 12..=12;
+    const B: RangeInclusive<u8> = 13..=13;
+    const X: RangeInclusive<u8> = 14..=14;
+    const R: RangeInclusive<u8> = 15..=15;
+
+    // Byte 2:
+    const pp: RangeInclusive<u8> = 16..=17;
+    const vvvv: RangeInclusive<u8> = 19..=22;
+    const W: RangeInclusive<u8> = 23..=23;
+
+    // Byte 3:
+    const aaa: RangeInclusive<u8> = 24..=26;
+    const V_: RangeInclusive<u8> = 27..=27;
+    #[allow(dead_code)] // Will be used once broadcast and rounding controls are exposed.
+    const b: RangeInclusive<u8> = 28..=28;
+    const LL: RangeInclusive<u8> = 29..=30;
+    const z: RangeInclusive<u8> = 31..=31;
+
+    // A convenience method for writing the `value` bits to the given range in `self.bits`.
+    #[inline]
+    fn write(&mut self, range: RangeInclusive<u8>, value: u32) {
+        assert!(ExactSizeIterator::len(&range) > 0);
+        let size = range.end() - range.start() + 1; // Calculate the number of bits in the range.
+        let mask: u32 = (1 << size) - 1; // Generate a bit mask.
+        debug_assert!(
+            value <= mask,
+            "The written value should have at most {} bits.",
+            size
+        );
+        let mask_complement = !(mask << *range.start()); // Create the bitwise complement for the clear mask.
+        self.bits &= mask_complement; // Clear the bits in `range`; otherwise the OR below may allow previously-set bits to slip through.
+        let value = value << *range.start(); // Place the value in the correct location (assumes `value <= mask`).
+        self.bits |= value; // Modify the bits in `range`.
+    }
+}
+
+/// Describe the register index to use. This wrapper is a type-safe way to pass
+/// around the registers defined in `inst/regs.rs`.
+#[derive(Copy, Clone, Default)]
+pub struct Register(u8);
+impl From<u8> for Register {
+    fn from(reg: u8) -> Self {
+        debug_assert!(reg < 16);
+        Self(reg)
+    }
+}
+impl From<Register> for u8 {
+    fn from(reg: Register) -> Self {
+        reg.0 // The raw hardware encoding; `Into<u8>` still works via the blanket impl.
+    }
+}
+
+/// Defines the EVEX context for the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte). Table 2-36 in
+/// section 2.6.10 (Intel Software Development Manual, volume 2A) describes how these bits can be
+/// used together for certain classes of instructions; i.e., special care should be taken to ensure
+/// that instructions use an applicable `EvexContext`. Table 2-39 contains cases where
+/// opcodes can result in an #UD.
+#[allow(dead_code, missing_docs)] // Rounding and broadcast modes are not yet used.
+pub enum EvexContext {
+    RoundingRegToRegFP {
+        rc: EvexRoundingControl,
+    },
+    NoRoundingFP {
+        sae: bool,
+        length: EvexVectorLength,
+    },
+    MemoryOp {
+        broadcast: bool,
+        length: EvexVectorLength,
+    },
+    Other {
+        length: EvexVectorLength,
+    },
+}
+
+impl Default for EvexContext {
+    fn default() -> Self {
+        Self::Other {
+            length: EvexVectorLength::default(),
+        }
+    }
+}
+
+impl EvexContext {
+    /// Encode the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte) for merging with the P2 byte.
+    pub fn bits(&self) -> u8 {
+        match self {
+            Self::RoundingRegToRegFP { rc } => 0b001 | rc.bits() << 1,
+            Self::NoRoundingFP { sae, length } => (*sae as u8) | length.bits() << 1,
+            Self::MemoryOp { broadcast, length } => (*broadcast as u8) | length.bits() << 1,
+            Self::Other { length } => length.bits() << 1,
+        }
+    }
+}
+
+/// The EVEX format allows choosing a vector length in the `L'` and `L` bits; see `EvexContext`.
+#[allow(dead_code, missing_docs)] // Wider-length vectors are not yet used.
+pub enum EvexVectorLength {
+    V128,
+    V256,
+    V512,
+}
+
+impl EvexVectorLength {
+    /// Encode the `L'` and `L` bits for merging with the P2 byte.
+    fn bits(&self) -> u8 {
+        match self {
+            Self::V128 => 0b00,
+            Self::V256 => 0b01,
+            Self::V512 => 0b10,
+            // 0b11 is reserved (#UD).
+        }
+    }
+}
+
+impl Default for EvexVectorLength {
+    fn default() -> Self {
+        Self::V128
+    }
+}
+
+/// The EVEX format allows defining rounding control in the `L'` and `L` bits; see `EvexContext`.
+#[allow(dead_code, missing_docs)] // Rounding controls are not yet used.
+pub enum EvexRoundingControl {
+    RNE,
+    RD,
+    RU,
+    RZ,
+}
+
+impl EvexRoundingControl {
+    /// Encode the `L'` and `L` bits for merging with the P2 byte.
+    fn bits(&self) -> u8 {
+        match self {
+            Self::RNE => 0b00,
+            Self::RD => 0b01,
+            Self::RU => 0b10,
+            Self::RZ => 0b11,
+        }
+    }
+}
+
+/// Defines the EVEX masking behavior; masking support is described in section 2.6.4 of the Intel
+/// Software Development Manual, volume 2A.
+#[allow(dead_code, missing_docs)] // Masking is not yet used.
+pub enum EvexMasking {
+    None,
+    Merging { k: u8 },
+    Zeroing { k: u8 },
+}
+
+impl Default for EvexMasking {
+    fn default() -> Self {
+        EvexMasking::None
+    }
+}
+
+impl EvexMasking {
+    /// Encode the `z` bit for merging with the P2 byte.
+    pub fn z_bit(&self) -> u8 {
+        match self {
+            Self::None | Self::Merging { .. } => 0,
+            Self::Zeroing { .. } => 1,
+        }
+    }
+
+    /// Encode the `aaa` bits for merging with the P2 byte.
+    pub fn aaa_bits(&self) -> u8 {
+        match self {
+            Self::None => 0b000,
+            Self::Merging { k } | Self::Zeroing { k } => {
+                debug_assert!(*k <= 7);
+                *k
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::isa::x64::inst::regs;
+    use std::vec::Vec;
+
+    // As a sanity test, we verify that the output of `xed-asmparse-main 'vpabsq xmm0{k0},
+    // xmm1'` matches this EVEX encoding machinery.
+    #[test]
+    fn vpabsq() {
+        let dst = regs::xmm0();
+        let src = regs::xmm1();
+        let mut sink0 = Vec::new();
+
+        EvexInstruction::new()
+            .prefix(LegacyPrefixes::_66)
+            .map(OpcodeMap::_0F38)
+            .w(true)
+            .opcode(0x1F)
+            .reg(dst.get_hw_encoding())
+            .rm(src.get_hw_encoding())
+            .length(EvexVectorLength::V128)
+            .encode(&mut sink0);
+
+        assert_eq!(sink0, vec![0x62, 0xf2, 0xfd, 0x08, 0x1f, 0xc1]);
+    }
+
+    /// Verify that the defaults are equivalent to an instruction with a `0x00` opcode using the
+    /// "0" register (i.e. `rax`), with sane defaults for the various configurable parameters. This
+    /// test is more interesting than it may appear because some of the parameters have flipped-bit
+    /// representations (e.g. `vvvvv`) so emitting 0s as a default will not work.
+    #[test]
+    fn default_emission() {
+        let mut sink0 = Vec::new();
+        EvexInstruction::new().encode(&mut sink0);
+
+        let mut sink1 = Vec::new();
+        EvexInstruction::new()
+            .length(EvexVectorLength::V128)
+            .prefix(LegacyPrefixes::None)
+            .map(OpcodeMap::None)
+            .w(false)
+            .opcode(0x00)
+            .reg(regs::rax().get_hw_encoding())
+            .rm(regs::rax().get_hw_encoding())
+            .mask(EvexMasking::None)
+            .encode(&mut sink1);
+
+        assert_eq!(sink0, sink1);
+    }
+}
--- a/cranelift/codegen/src/isa/x64/encoding/mod.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/mod.rs
@@ -0,0 +1,60 @@
+//! Contains the encoding machinery for the various x64 instruction formats.
+use crate::{isa::x64, machinst::MachBuffer};
+use std::vec::Vec;
+
+pub mod evex;
+pub mod rex;
+pub mod vex;
+
+/// The encoding formats in this module all require a way of placing bytes into
+/// a buffer.
+pub trait ByteSink {
+    /// Add 1 byte to the code section.
+    fn put1(&mut self, _: u8);
+
+    /// Add 2 bytes to the code section.
+    fn put2(&mut self, _: u16);
+
+    /// Add 4 bytes to the code section.
+    fn put4(&mut self, _: u32);
+
+    /// Add 8 bytes to the code section.
+    fn put8(&mut self, _: u64);
+}
+
+impl ByteSink for MachBuffer<x64::inst::Inst> {
+    fn put1(&mut self, value: u8) {
+        self.put1(value)
+    }
+
+    fn put2(&mut self, value: u16) {
+        self.put2(value)
+    }
+
+    fn put4(&mut self, value: u32) {
+        self.put4(value)
+    }
+
+    fn put8(&mut self, value: u64) {
+        self.put8(value)
+    }
+}
+
+/// Provide a convenient implementation for testing.
+impl ByteSink for Vec<u8> {
+    fn put1(&mut self, v: u8) {
+        self.extend_from_slice(&[v])
+    }
+
+    fn put2(&mut self, v: u16) {
+        self.extend_from_slice(&v.to_le_bytes())
+    }
+
+    fn put4(&mut self, v: u32) {
+        self.extend_from_slice(&v.to_le_bytes())
+    }
+
+    fn put8(&mut self, v: u64) {
+        self.extend_from_slice(&v.to_le_bytes())
+    }
+}
--- a/cranelift/codegen/src/isa/x64/encoding/rex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/rex.rs
@@ -0,0 +1,504 @@
+//! Encodes instructions in the standard x86 encoding mode. This is called IA-32E mode in the Intel
+//! manuals but corresponds to the addition of the REX-prefix format (hence the name of this module)
+//! that allowed encoding instructions in both compatibility mode (32-bit instructions running on a
+//! 64-bit OS) and in 64-bit mode (using the full 64-bit address space).
+//!
+//! For all of the routines that take both a memory-or-reg operand (sometimes called "E" in the
+//! Intel documentation, see the Intel Developer's manual, vol. 2, section A.2) and a reg-only
+//! operand ("G" in Intelese), the order is always G first, then E. The term "enc" in the following
+//! means "hardware register encoding number".
+
+use crate::{
+    ir::TrapCode,
+    isa::x64::inst::{
+        args::{Amode, OperandSize},
+        regs, EmitInfo, EmitState, Inst, LabelUse,
+    },
+    machinst::{MachBuffer, MachInstEmitInfo},
+};
+use regalloc::{Reg, RegClass};
+
+pub(crate) fn low8_will_sign_extend_to_64(x: u32) -> bool {
+    let xs = (x as i32) as i64;
+    xs == ((xs << 56) >> 56)
+}
+
+pub(crate) fn low8_will_sign_extend_to_32(x: u32) -> bool {
+    let xs = x as i32;
+    xs == ((xs << 24) >> 24)
+}
+
+/// Encode the ModR/M byte.
+#[inline(always)]
+pub fn encode_modrm(m0d: u8, enc_reg_g: u8, rm_e: u8) -> u8 {
+    debug_assert!(m0d < 4);
+    debug_assert!(enc_reg_g < 8);
+    debug_assert!(rm_e < 8);
+    ((m0d & 3) << 6) | ((enc_reg_g & 7) << 3) | (rm_e & 7)
+}
+
+#[inline(always)]
+pub(crate) fn encode_sib(shift: u8, enc_index: u8, enc_base: u8) -> u8 {
+    debug_assert!(shift < 4);
+    debug_assert!(enc_index < 8);
+    debug_assert!(enc_base < 8);
+    ((shift & 3) << 6) | ((enc_index & 7) << 3) | (enc_base & 7)
+}
+
+/// Get the encoding number of a GPR.
+#[inline(always)]
+pub(crate) fn int_reg_enc(reg: Reg) -> u8 {
+    debug_assert!(reg.is_real());
+    debug_assert_eq!(reg.get_class(), RegClass::I64);
+    reg.get_hw_encoding()
+}
+
+/// Get the encoding number of any register.
+#[inline(always)]
+pub(crate) fn reg_enc(reg: Reg) -> u8 {
+    debug_assert!(reg.is_real());
+    reg.get_hw_encoding()
+}
+
+/// A small bit field to record a REX prefix specification:
+/// - bit 0 set to 1 indicates REX.W must be 0 (cleared).
+/// - bit 1 set to 1 indicates the REX prefix must always be emitted.
+#[repr(transparent)]
+#[derive(Clone, Copy)]
+pub(crate) struct RexFlags(u8);
+
+impl RexFlags {
+    /// By default, set the W field, and don't always emit.
+    #[inline(always)]
+    pub(crate) fn set_w() -> Self {
+        Self(0)
+    }
+    /// Creates a new RexFlags for which the REX.W bit will be cleared.
+    #[inline(always)]
+    pub(crate) fn clear_w() -> Self {
+        Self(1)
+    }
+
+    #[inline(always)]
+    pub(crate) fn always_emit(&mut self) -> &mut Self {
+        self.0 = self.0 | 2;
+        self
+    }
+
+    #[inline(always)]
+    pub(crate) fn always_emit_if_8bit_needed(&mut self, reg: Reg) -> &mut Self {
+        let enc_reg = int_reg_enc(reg);
+        if enc_reg >= 4 && enc_reg <= 7 {
+            self.always_emit();
+        }
+        self
+    }
+
+    #[inline(always)]
+    pub(crate) fn must_clear_w(&self) -> bool {
+        (self.0 & 1) != 0
+    }
+    #[inline(always)]
+    pub(crate) fn must_always_emit(&self) -> bool {
+        (self.0 & 2) != 0
+    }
+
+    #[inline(always)]
+    pub(crate) fn emit_two_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_e: u8) {
+        let w = if self.must_clear_w() { 0 } else { 1 };
+        let r = (enc_g >> 3) & 1;
+        let x = 0;
+        let b = (enc_e >> 3) & 1;
+        let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+        if rex != 0x40 || self.must_always_emit() {
+            sink.put1(rex);
+        }
+    }
+
+    #[inline(always)]
+    pub fn emit_three_op(
+        &self,
+        sink: &mut MachBuffer<Inst>,
+        enc_g: u8,
+        enc_index: u8,
+        enc_base: u8,
+    ) {
+        let w = if self.must_clear_w() { 0 } else { 1 };
+        let r = (enc_g >> 3) & 1;
+        let x = (enc_index >> 3) & 1;
+        let b = (enc_base >> 3) & 1;
+        let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
+        if rex != 0x40 || self.must_always_emit() {
+            sink.put1(rex);
+        }
+    }
+}
+
+/// Generate the proper Rex flags for the given operand size.
+impl From<OperandSize> for RexFlags {
+    fn from(size: OperandSize) -> Self {
+        match size {
+            OperandSize::Size64 => RexFlags::set_w(),
+            _ => RexFlags::clear_w(),
+        }
+    }
+}
+/// Generate Rex flags for an OperandSize/register tuple.
+impl From<(OperandSize, Reg)> for RexFlags {
+    fn from((size, reg): (OperandSize, Reg)) -> Self {
+        let mut rex = RexFlags::from(size);
+        if size == OperandSize::Size8 {
+            rex.always_emit_if_8bit_needed(reg);
+        }
+        rex
+    }
+}
+
+/// Allows using the same opcode byte in different "opcode maps" to allow for more instruction
+/// encodings. See appendix A in the Intel Software Developer's Manual, volume 2A, for more details.
+#[allow(missing_docs)]
+pub enum OpcodeMap {
+    None,
+    _0F,
+    _0F38,
+    _0F3A,
+}
+
+impl OpcodeMap {
+    /// Normally the opcode map is specified as bytes in the instruction, but some x64 encoding
+    /// formats pack this information as bits in a prefix (e.g. EVEX).
+    pub(crate) fn bits(&self) -> u8 {
+        match self {
+            OpcodeMap::None => 0b00,
+            OpcodeMap::_0F => 0b01,
+            OpcodeMap::_0F38 => 0b10,
+            OpcodeMap::_0F3A => 0b11,
+        }
+    }
+}
+
+impl Default for OpcodeMap {
+    fn default() -> Self {
+        Self::None
+    }
+}
+
+/// We may need to include one or more legacy prefix bytes before the REX prefix.  This enum
+/// covers only the small set of possibilities that we actually need.
+pub enum LegacyPrefixes {
+    /// No prefix bytes.
+    None,
+    /// Operand Size Override -- here, denoting "16-bit operation".
+    _66,
+    /// The Lock prefix.
+    _F0,
+    /// Operand size override and Lock.
+    _66F0,
+    /// REPNE, but no specific meaning here -- is just an opcode extension.
+    _F2,
+    /// REP/REPE, but no specific meaning here -- is just an opcode extension.
+    _F3,
+    /// Operand size override and same effect as F3.
+    _66F3,
+}
+
+impl LegacyPrefixes {
+    /// Emit the legacy prefix as bytes (e.g. in REX instructions).
+    #[inline(always)]
+    pub(crate) fn emit(&self, sink: &mut MachBuffer<Inst>) {
+        match self {
+            Self::_66 => sink.put1(0x66),
+            Self::_F0 => sink.put1(0xF0),
+            Self::_66F0 => {
+                // I don't think the order matters, but in any case, this is the same order that
+                // the GNU assembler uses.
+                sink.put1(0x66);
+                sink.put1(0xF0);
+            }
+            Self::_F2 => sink.put1(0xF2),
+            Self::_F3 => sink.put1(0xF3),
+            Self::_66F3 => {
+                sink.put1(0x66);
+                sink.put1(0xF3);
+            }
+            Self::None => (),
+        }
+    }
+
+    /// Emit the legacy prefix as bits (e.g. for EVEX instructions).
+    #[inline(always)]
+    pub(crate) fn bits(&self) -> u8 {
+        match self {
+            Self::None => 0b00,
+            Self::_66 => 0b01,
+            Self::_F3 => 0b10,
+            Self::_F2 => 0b11,
+            _ => panic!(
+                "VEX and EVEX bits can only be extracted from single prefixes: None, 66, F3, F2"
+            ),
+        }
+    }
+}
+
+impl Default for LegacyPrefixes {
+    fn default() -> Self {
+        Self::None
+    }
+}
+
+/// This is the core 'emit' function for instructions that reference memory.
+///
+/// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`,
+/// create and emit:
+/// - first the legacy prefixes, if any
+/// - then the REX prefix, if needed
+/// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`),
+/// - then the MOD/RM byte,
+/// - then optionally, a SIB byte,
+/// - and finally optionally an immediate that will be derived from the `mem_e` operand.
+///
+/// For most instructions up to and including SSE4.2, that will be the whole instruction: this is
+/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed
+/// instructions will require their own emitter functions.
+///
+/// This will also work for 32-bit x86 instructions, assuming no REX prefix is provided.
+///
+/// The opcodes are written bigendianly for the convenience of callers.  For example, if the opcode
+/// bytes to be emitted are, in this order, F3 0F 27, then the caller should pass `opcodes` ==
+/// 0xF3_0F_27 and `num_opcodes` == 3.
+///
+/// The register operand is represented here not as a `Reg` but as its hardware encoding, `enc_g`.
+/// `rex` can specify special handling for the REX prefix.  By default, the REX prefix will
+/// indicate a 64-bit operation and will be deleted if it is redundant (0x40).  Note that for a
+/// 64-bit operation, the REX prefix will normally never be redundant, since REX.W must be 1 to
+/// indicate a 64-bit operation.
+pub(crate) fn emit_std_enc_mem(
+    sink: &mut MachBuffer<Inst>,
+    state: &EmitState,
+    info: &EmitInfo,
+    prefixes: LegacyPrefixes,
+    opcodes: u32,
+    mut num_opcodes: usize,
+    enc_g: u8,
+    mem_e: &Amode,
+    rex: RexFlags,
+) {
+    // General comment for this function: the registers in `mem_e` must be
+    // 64-bit integer registers, because they are part of an address
+    // expression.  But `enc_g` can be derived from a register of any class.
+
+    let srcloc = state.cur_srcloc();
+    let can_trap = mem_e.can_trap();
+    if can_trap {
+        sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+    }
+
+    prefixes.emit(sink);
+
+    match mem_e {
+        Amode::ImmReg { simm32, base, .. } => {
+            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
+            // first touch of a new stack page.
+            if *base == regs::rsp() && !can_trap && info.flags().enable_probestack() {
+                sink.add_trap(srcloc, TrapCode::StackOverflow);
+            }
+
+            // First, the REX byte.
+            let enc_e = int_reg_enc(*base);
+            rex.emit_two_op(sink, enc_g, enc_e);
+
+            // Now the opcode(s).  These include any other prefixes the caller
+            // hands to us.
+            while num_opcodes > 0 {
+                num_opcodes -= 1;
+                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+            }
+
+            // Now the mod/rm and associated immediates.  This is
+            // significantly complicated due to the multiple special cases.
+            if *simm32 == 0
+                && enc_e != regs::ENC_RSP
+                && enc_e != regs::ENC_RBP
+                && enc_e != regs::ENC_R12
+                && enc_e != regs::ENC_R13
+            {
+                // FIXME JRS 2020Feb11: those four tests can surely be
+                // replaced by a single mask-and-compare check.  We should do
+                // that because this routine is likely to be hot.
+                sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7));
+            } else if *simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) {
+                sink.put1(encode_modrm(0, enc_g & 7, 4));
+                sink.put1(0x24);
+            } else if low8_will_sign_extend_to_32(*simm32)
+                && enc_e != regs::ENC_RSP
+                && enc_e != regs::ENC_R12
+            {
+                sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7));
+                sink.put1((simm32 & 0xFF) as u8);
+            } else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 {
+                sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7));
+                sink.put4(*simm32);
+            } else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12)
+                && low8_will_sign_extend_to_32(*simm32)
+            {
+                // REX.B distinguishes RSP from R12
+                sink.put1(encode_modrm(1, enc_g & 7, 4));
+                sink.put1(0x24);
+                sink.put1((simm32 & 0xFF) as u8);
+            } else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP {
+                //.. wait for test case for RSP case
+                // REX.B distinguishes RSP from R12
+                sink.put1(encode_modrm(2, enc_g & 7, 4));
+                sink.put1(0x24);
+                sink.put4(*simm32);
+            } else {
+                unreachable!("ImmReg");
+            }
+        }
+
+        Amode::ImmRegRegShift {
+            simm32,
+            base: reg_base,
+            index: reg_index,
+            shift,
+            ..
+        } => {
+            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
+            // first touch of a new stack page.
+            if *reg_base == regs::rsp() && !can_trap && info.flags().enable_probestack() {
+                sink.add_trap(srcloc, TrapCode::StackOverflow);
+            }
+
+            let enc_base = int_reg_enc(*reg_base);
+            let enc_index = int_reg_enc(*reg_index);
+
+            // The rex byte.
+            rex.emit_three_op(sink, enc_g, enc_index, enc_base);
+
+            // All other prefixes and opcodes.
+            while num_opcodes > 0 {
+                num_opcodes -= 1;
+                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+            }
+
+            // modrm, SIB, immediates.
+            if low8_will_sign_extend_to_32(*simm32) && enc_index != regs::ENC_RSP {
+                sink.put1(encode_modrm(1, enc_g & 7, 4));
+                sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
+                sink.put1(*simm32 as u8);
+            } else if enc_index != regs::ENC_RSP {
+                sink.put1(encode_modrm(2, enc_g & 7, 4));
+                sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
+                sink.put4(*simm32);
+            } else {
+                panic!("ImmRegRegShift");
+            }
+        }
+
+        Amode::RipRelative { ref target } => {
+            // First, the REX byte, with REX.B = 0.
+            rex.emit_two_op(sink, enc_g, 0);
+
+            // Now the opcode(s).  These include any other prefixes the caller
+            // hands to us.
+            while num_opcodes > 0 {
+                num_opcodes -= 1;
+                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+            }
+
+            // RIP-relative is mod=00, rm=101.
+            sink.put1(encode_modrm(0, enc_g & 7, 0b101));
+
+            let offset = sink.cur_offset();
+            sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32);
+            sink.put4(0);
+        }
+    }
+}
+
+/// This is the core 'emit' function for instructions that do not reference memory.
+///
+/// This is conceptually the same as `emit_std_enc_mem`, except it is for the case where the E
+/// operand is a register rather than memory.  Hence it is much simpler.
+pub(crate) fn emit_std_enc_enc(
+    sink: &mut MachBuffer<Inst>,
+    prefixes: LegacyPrefixes,
+    opcodes: u32,
+    mut num_opcodes: usize,
+    enc_g: u8,
+    enc_e: u8,
+    rex: RexFlags,
+) {
+    // EncG and EncE can be derived from registers of any class, and they
+    // don't even have to be from the same class.  For example, for an
+    // integer-to-FP conversion insn, one might be RegClass::I64 and the other
+    // RegClass::V128.
+
+    // The legacy prefixes.
+    prefixes.emit(sink);
+
+    // The rex byte.
+    rex.emit_two_op(sink, enc_g, enc_e);
+
+    // All other prefixes and opcodes.
+    while num_opcodes > 0 {
+        num_opcodes -= 1;
+        sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+    }
+
+    // Now the mod/rm byte.  The instruction we're generating doesn't access
+    // memory, so there is no SIB byte or immediate -- we're done.
+    sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7));
+}
+
+// These are merely wrappers for the above two functions that facilitate passing
+// actual `Reg`s rather than their encodings.
+
+pub(crate) fn emit_std_reg_mem(
+    sink: &mut MachBuffer<Inst>,
+    state: &EmitState,
+    info: &EmitInfo,
+    prefixes: LegacyPrefixes,
+    opcodes: u32,
+    num_opcodes: usize,
+    reg_g: Reg,
+    mem_e: &Amode,
+    rex: RexFlags,
+) {
+    let enc_g = reg_enc(reg_g);
+    emit_std_enc_mem(
+        sink,
+        state,
+        info,
+        prefixes,
+        opcodes,
+        num_opcodes,
+        enc_g,
+        mem_e,
+        rex,
+    );
+}
+
+pub(crate) fn emit_std_reg_reg(
+    sink: &mut MachBuffer<Inst>,
+    prefixes: LegacyPrefixes,
+    opcodes: u32,
+    num_opcodes: usize,
+    reg_g: Reg,
+    reg_e: Reg,
+    rex: RexFlags,
+) {
+    let enc_g = reg_enc(reg_g);
+    let enc_e = reg_enc(reg_e);
+    emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex);
+}
+
+/// Write `simm32` to the sink as 1, 2 or 4 bytes per `size` (both 4 and 8 emit 4 bytes).
+pub(crate) fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) {
+    match size {
+        8 | 4 => sink.put4(simm32),
+        2 => sink.put2(simm32 as u16),
+        1 => sink.put1(simm32 as u8),
+        _ => unreachable!(),
+    }
+}
--- a/cranelift/codegen/src/isa/x64/encoding/vex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/vex.rs
@@ -0,0 +1,2 @@
+//! Encodes VEX instructions. These instructions are those added by the Advanced Vector Extensions
+//! (AVX).
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -10,6 +10,7 @@ use regalloc::{
    PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector,
    RegUsageMapper, Writable,
 };
+use smallvec::{smallvec, SmallVec};
 use std::fmt;
 use std::string::String;

@@ -411,12 +412,12 @@ pub enum UnaryRmROpcode {
 }

 impl UnaryRmROpcode {
-    pub(crate) fn available_from(&self) -> Option<InstructionSet> {
+    pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
        match self {
-            UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => None,
-            UnaryRmROpcode::Lzcnt => Some(InstructionSet::Lzcnt),
-            UnaryRmROpcode::Tzcnt => Some(InstructionSet::BMI1),
-            UnaryRmROpcode::Popcnt => Some(InstructionSet::Popcnt),
+            UnaryRmROpcode::Bsr | UnaryRmROpcode::Bsf => smallvec![],
+            UnaryRmROpcode::Lzcnt => smallvec![InstructionSet::Lzcnt],
+            UnaryRmROpcode::Tzcnt => smallvec![InstructionSet::BMI1],
+            UnaryRmROpcode::Popcnt => smallvec![InstructionSet::Popcnt],
        }
    }
 }
@@ -447,6 +448,7 @@ pub enum CmpOpcode {
    Test,
 }

+#[derive(Debug)]
 pub(crate) enum InstructionSet {
    SSE,
    SSE2,
@@ -458,10 +460,13 @@ pub(crate) enum InstructionSet {
    BMI1,
    #[allow(dead_code)] // never constructed (yet).
    BMI2,
+    AVX512F,
+    AVX512VL,
 }

 /// Some SSE operations requiring 2 operands r/m and r.
 #[derive(Clone, Copy, PartialEq)]
+#[allow(dead_code)] // some variants here aren't used just yet
 pub enum SseOpcode {
    Addps,
    Addpd,
@@ -479,6 +484,7 @@ pub enum SseOpcode {
    Cmpss,
    Cmpsd,
    Cvtdq2ps,
+    Cvtdq2pd,
    Cvtsd2ss,
    Cvtsd2si,
    Cvtsi2ss,
@@ -672,6 +678,7 @@ impl SseOpcode {
            | SseOpcode::Cmpsd
            | SseOpcode::Comisd
            | SseOpcode::Cvtdq2ps
+            | SseOpcode::Cvtdq2pd
            | SseOpcode::Cvtsd2ss
            | SseOpcode::Cvtsd2si
            | SseOpcode::Cvtsi2sd
@@ -827,6 +834,7 @@ impl fmt::Debug for SseOpcode {
            SseOpcode::Comiss => "comiss",
            SseOpcode::Comisd => "comisd",
            SseOpcode::Cvtdq2ps => "cvtdq2ps",
+            SseOpcode::Cvtdq2pd => "cvtdq2pd",
            SseOpcode::Cvtsd2ss => "cvtsd2ss",
            SseOpcode::Cvtsd2si => "cvtsd2si",
            SseOpcode::Cvtsi2ss => "cvtsi2ss",
@@ -983,6 +991,35 @@ impl fmt::Display for SseOpcode {
    }
 }

+#[derive(Clone)]
+pub enum Avx512Opcode {
+    Vpabsq, // the only AVX-512 opcode defined so far
+}
+
+impl Avx512Opcode {
+    /// The `InstructionSet`s associated with this opcode; `emit` panics if the target matches none.
+    pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
+        match self { // NOTE(review): 128-bit VPABSQ presumably needs F *and* VL together — confirm
+            Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
+        }
+    }
+}
+
+impl fmt::Debug for Avx512Opcode {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        let name = match self {
+            Avx512Opcode::Vpabsq => "vpabsq",
+        };
+        write!(fmt, "{}", name) // prints the bare lowercase mnemonic
+    }
+}
+
+impl fmt::Display for Avx512Opcode {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        fmt::Debug::fmt(self, f) // Display reuses the Debug output
+    }
+}
+
 /// This defines the ways a value can be extended: either signed- or zero-extension, or none for
 /// types that are not extended. Contrast with [ExtMode], which defines the widths from and to which
 /// values can be extended.
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2,449 +2,18 @@ use crate::binemit::{Addend, Reloc};
 use crate::ir::immediates::{Ieee32, Ieee64};
 use crate::ir::LibCall;
 use crate::ir::TrapCode;
+use crate::isa::x64::encoding::evex::{EvexInstruction, EvexVectorLength};
+use crate::isa::x64::encoding::rex::{
+    emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
+    low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
+    RexFlags,
+};
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
 use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel};
 use core::convert::TryInto;
 use log::debug;
-use regalloc::{Reg, RegClass, Writable};
-
-fn low8_will_sign_extend_to_64(x: u32) -> bool {
-    let xs = (x as i32) as i64;
-    xs == ((xs << 56) >> 56)
-}
-
-fn low8_will_sign_extend_to_32(x: u32) -> bool {
-    let xs = x as i32;
-    xs == ((xs << 24) >> 24)
-}
-
-//=============================================================================
-// Instructions and subcomponents: emission
-
-// For all of the routines that take both a memory-or-reg operand (sometimes
-// called "E" in the Intel documentation) and a reg-only operand ("G" in
-// Intelese), the order is always G first, then E.
-//
-// "enc" in the following means "hardware register encoding number".
-
-#[inline(always)]
-fn encode_modrm(m0d: u8, enc_reg_g: u8, rm_e: u8) -> u8 {
-    debug_assert!(m0d < 4);
-    debug_assert!(enc_reg_g < 8);
-    debug_assert!(rm_e < 8);
-    ((m0d & 3) << 6) | ((enc_reg_g & 7) << 3) | (rm_e & 7)
-}
-
-#[inline(always)]
-fn encode_sib(shift: u8, enc_index: u8, enc_base: u8) -> u8 {
-    debug_assert!(shift < 4);
-    debug_assert!(enc_index < 8);
-    debug_assert!(enc_base < 8);
-    ((shift & 3) << 6) | ((enc_index & 7) << 3) | (enc_base & 7)
-}
-
-/// Get the encoding number of a GPR.
-#[inline(always)]
-fn int_reg_enc(reg: Reg) -> u8 {
-    debug_assert!(reg.is_real());
-    debug_assert_eq!(reg.get_class(), RegClass::I64);
-    reg.get_hw_encoding()
-}
-
-/// Get the encoding number of any register.
-#[inline(always)]
-fn reg_enc(reg: Reg) -> u8 {
-    debug_assert!(reg.is_real());
-    reg.get_hw_encoding()
-}
-
-/// A small bit field to record a REX prefix specification:
-/// - bit 0 set to 1 indicates REX.W must be 0 (cleared).
-/// - bit 1 set to 1 indicates the REX prefix must always be emitted.
-#[repr(transparent)]
-#[derive(Clone, Copy)]
-struct RexFlags(u8);
-
-impl RexFlags {
-    /// By default, set the W field, and don't always emit.
-    #[inline(always)]
-    fn set_w() -> Self {
-        Self(0)
-    }
-    /// Creates a new RexPrefix for which the REX.W bit will be cleared.
-    #[inline(always)]
-    fn clear_w() -> Self {
-        Self(1)
-    }
-
-    #[inline(always)]
-    fn always_emit(&mut self) -> &mut Self {
-        self.0 = self.0 | 2;
-        self
-    }
-
-    #[inline(always)]
-    fn always_emit_if_8bit_needed(&mut self, reg: Reg) -> &mut Self {
-        let enc_reg = int_reg_enc(reg);
-        if enc_reg >= 4 && enc_reg <= 7 {
-            self.always_emit();
-        }
-        self
-    }
-
-    #[inline(always)]
-    fn must_clear_w(&self) -> bool {
-        (self.0 & 1) != 0
-    }
-    #[inline(always)]
-    fn must_always_emit(&self) -> bool {
-        (self.0 & 2) != 0
-    }
-
-    #[inline(always)]
-    fn emit_two_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_e: u8) {
-        let w = if self.must_clear_w() { 0 } else { 1 };
-        let r = (enc_g >> 3) & 1;
-        let x = 0;
-        let b = (enc_e >> 3) & 1;
-        let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
-        if rex != 0x40 || self.must_always_emit() {
-            sink.put1(rex);
-        }
-    }
-
-    #[inline(always)]
-    fn emit_three_op(&self, sink: &mut MachBuffer<Inst>, enc_g: u8, enc_index: u8, enc_base: u8) {
-        let w = if self.must_clear_w() { 0 } else { 1 };
-        let r = (enc_g >> 3) & 1;
-        let x = (enc_index >> 3) & 1;
-        let b = (enc_base >> 3) & 1;
-        let rex = 0x40 | (w << 3) | (r << 2) | (x << 1) | b;
-        if rex != 0x40 || self.must_always_emit() {
-            sink.put1(rex);
-        }
-    }
-}
-
-/// Generate the proper Rex flags for the given operand size.
-impl From<OperandSize> for RexFlags {
-    fn from(size: OperandSize) -> Self {
-        match size {
-            OperandSize::Size64 => RexFlags::set_w(),
-            _ => RexFlags::clear_w(),
-        }
-    }
-}
-/// Generate Rex flags for an OperandSize/register tuple.
-impl From<(OperandSize, Reg)> for RexFlags {
-    fn from((size, reg): (OperandSize, Reg)) -> Self {
-        let mut rex = RexFlags::from(size);
-        if size == OperandSize::Size8 {
-            rex.always_emit_if_8bit_needed(reg);
-        }
-        rex
-    }
-}
-
-/// We may need to include one or more legacy prefix bytes before the REX prefix.  This enum
-/// covers only the small set of possibilities that we actually need.
-enum LegacyPrefixes {
-    /// No prefix bytes.
-    None,
-    /// Operand Size Override -- here, denoting "16-bit operation".
-    _66,
-    /// The Lock prefix.
-    _F0,
-    /// Operand size override and Lock.
-    _66F0,
-    /// REPNE, but no specific meaning here -- is just an opcode extension.
-    _F2,
-    /// REP/REPE, but no specific meaning here -- is just an opcode extension.
-    _F3,
-    /// Operand size override and same effect as F3.
-    _66F3,
-}
-
-impl LegacyPrefixes {
-    #[inline(always)]
-    fn emit(&self, sink: &mut MachBuffer<Inst>) {
-        match self {
-            LegacyPrefixes::_66 => sink.put1(0x66),
-            LegacyPrefixes::_F0 => sink.put1(0xF0),
-            LegacyPrefixes::_66F0 => {
-                // I don't think the order matters, but in any case, this is the same order that
-                // the GNU assembler uses.
-                sink.put1(0x66);
-                sink.put1(0xF0);
-            }
-            LegacyPrefixes::_F2 => sink.put1(0xF2),
-            LegacyPrefixes::_F3 => sink.put1(0xF3),
-            LegacyPrefixes::_66F3 => {
-                sink.put1(0x66);
-                sink.put1(0xF3);
-            }
-            LegacyPrefixes::None => (),
-        }
-    }
-}
-
-/// This is the core 'emit' function for instructions that reference memory.
-///
-/// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`,
-/// create and emit:
-/// - first the legacy prefixes, if any
-/// - then the REX prefix, if needed
-/// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`),
-/// - then the MOD/RM byte,
-/// - then optionally, a SIB byte,
-/// - and finally optionally an immediate that will be derived from the `mem_e` operand.
-///
-/// For most instructions up to and including SSE4.2, that will be the whole instruction: this is
-/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed
-/// instructions will require their own emitter functions.
-///
-/// This will also work for 32-bits x86 instructions, assuming no REX prefix is provided.
-///
-/// The opcodes are written bigendianly for the convenience of callers.  For example, if the opcode
-/// bytes to be emitted are, in this order, F3 0F 27, then the caller should pass `opcodes` ==
-/// 0xF3_0F_27 and `num_opcodes` == 3.
-///
-/// The register operand is represented here not as a `Reg` but as its hardware encoding, `enc_g`.
-/// `rex` can specify special handling for the REX prefix.  By default, the REX prefix will
-/// indicate a 64-bit operation and will be deleted if it is redundant (0x40).  Note that for a
-/// 64-bit operation, the REX prefix will normally never be redundant, since REX.W must be 1 to
-/// indicate a 64-bit operation.
-fn emit_std_enc_mem(
-    sink: &mut MachBuffer<Inst>,
-    state: &EmitState,
-    info: &EmitInfo,
-    prefixes: LegacyPrefixes,
-    opcodes: u32,
-    mut num_opcodes: usize,
-    enc_g: u8,
-    mem_e: &Amode,
-    rex: RexFlags,
-) {
-    // General comment for this function: the registers in `mem_e` must be
-    // 64-bit integer registers, because they are part of an address
-    // expression.  But `enc_g` can be derived from a register of any class.
-
-    let srcloc = state.cur_srcloc();
-    let can_trap = mem_e.can_trap();
-    if can_trap {
-        sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
-    }
-
-    prefixes.emit(sink);
-
-    match mem_e {
-        Amode::ImmReg { simm32, base, .. } => {
-            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
-            // first touch of a new stack page.
-            if *base == regs::rsp() && !can_trap && info.flags().enable_probestack() {
-                sink.add_trap(srcloc, TrapCode::StackOverflow);
-            }
-
-            // First, the REX byte.
-            let enc_e = int_reg_enc(*base);
-            rex.emit_two_op(sink, enc_g, enc_e);
-
-            // Now the opcode(s).  These include any other prefixes the caller
-            // hands to us.
-            while num_opcodes > 0 {
-                num_opcodes -= 1;
-                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
-            }
-
-            // Now the mod/rm and associated immediates.  This is
-            // significantly complicated due to the multiple special cases.
-            if *simm32 == 0
-                && enc_e != regs::ENC_RSP
-                && enc_e != regs::ENC_RBP
-                && enc_e != regs::ENC_R12
-                && enc_e != regs::ENC_R13
-            {
-                // FIXME JRS 2020Feb11: those four tests can surely be
-                // replaced by a single mask-and-compare check.  We should do
-                // that because this routine is likely to be hot.
-                sink.put1(encode_modrm(0, enc_g & 7, enc_e & 7));
-            } else if *simm32 == 0 && (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12) {
-                sink.put1(encode_modrm(0, enc_g & 7, 4));
-                sink.put1(0x24);
-            } else if low8_will_sign_extend_to_32(*simm32)
-                && enc_e != regs::ENC_RSP
-                && enc_e != regs::ENC_R12
-            {
-                sink.put1(encode_modrm(1, enc_g & 7, enc_e & 7));
-                sink.put1((simm32 & 0xFF) as u8);
-            } else if enc_e != regs::ENC_RSP && enc_e != regs::ENC_R12 {
-                sink.put1(encode_modrm(2, enc_g & 7, enc_e & 7));
-                sink.put4(*simm32);
-            } else if (enc_e == regs::ENC_RSP || enc_e == regs::ENC_R12)
-                && low8_will_sign_extend_to_32(*simm32)
-            {
-                // REX.B distinguishes RSP from R12
-                sink.put1(encode_modrm(1, enc_g & 7, 4));
-                sink.put1(0x24);
-                sink.put1((simm32 & 0xFF) as u8);
-            } else if enc_e == regs::ENC_R12 || enc_e == regs::ENC_RSP {
-                //.. wait for test case for RSP case
-                // REX.B distinguishes RSP from R12
-                sink.put1(encode_modrm(2, enc_g & 7, 4));
-                sink.put1(0x24);
-                sink.put4(*simm32);
-            } else {
-                unreachable!("ImmReg");
-            }
-        }
-
-        Amode::ImmRegRegShift {
-            simm32,
-            base: reg_base,
-            index: reg_index,
-            shift,
-            ..
-        } => {
-            // If this is an access based off of RSP, it may trap with a stack overflow if it's the
-            // first touch of a new stack page.
-            if *reg_base == regs::rsp() && !can_trap && info.flags().enable_probestack() {
-                sink.add_trap(srcloc, TrapCode::StackOverflow);
-            }
-
-            let enc_base = int_reg_enc(*reg_base);
-            let enc_index = int_reg_enc(*reg_index);
-
-            // The rex byte.
-            rex.emit_three_op(sink, enc_g, enc_index, enc_base);
-
-            // All other prefixes and opcodes.
-            while num_opcodes > 0 {
-                num_opcodes -= 1;
-                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
-            }
-
-            // modrm, SIB, immediates.
-            if low8_will_sign_extend_to_32(*simm32) && enc_index != regs::ENC_RSP {
-                sink.put1(encode_modrm(1, enc_g & 7, 4));
-                sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
-                sink.put1(*simm32 as u8);
-            } else if enc_index != regs::ENC_RSP {
-                sink.put1(encode_modrm(2, enc_g & 7, 4));
-                sink.put1(encode_sib(*shift, enc_index & 7, enc_base & 7));
-                sink.put4(*simm32);
-            } else {
-                panic!("ImmRegRegShift");
-            }
-        }
-
-        Amode::RipRelative { ref target } => {
-            // First, the REX byte, with REX.B = 0.
-            rex.emit_two_op(sink, enc_g, 0);
-
-            // Now the opcode(s).  These include any other prefixes the caller
-            // hands to us.
-            while num_opcodes > 0 {
-                num_opcodes -= 1;
-                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
-            }
-
-            // RIP-relative is mod=00, rm=101.
-            sink.put1(encode_modrm(0, enc_g & 7, 0b101));
-
-            let offset = sink.cur_offset();
-            sink.use_label_at_offset(offset, *target, LabelUse::JmpRel32);
-            sink.put4(0);
-        }
-    }
-}
-
-/// This is the core 'emit' function for instructions that do not reference memory.
-///
-/// This is conceptually the same as emit_modrm_sib_enc_ge, except it is for the case where the E
-/// operand is a register rather than memory.  Hence it is much simpler.
-fn emit_std_enc_enc(
-    sink: &mut MachBuffer<Inst>,
-    prefixes: LegacyPrefixes,
-    opcodes: u32,
-    mut num_opcodes: usize,
-    enc_g: u8,
-    enc_e: u8,
-    rex: RexFlags,
-) {
-    // EncG and EncE can be derived from registers of any class, and they
-    // don't even have to be from the same class.  For example, for an
-    // integer-to-FP conversion insn, one might be RegClass::I64 and the other
-    // RegClass::V128.
-
-    // The legacy prefixes.
-    prefixes.emit(sink);
-
-    // The rex byte.
-    rex.emit_two_op(sink, enc_g, enc_e);
-
-    // All other prefixes and opcodes.
-    while num_opcodes > 0 {
-        num_opcodes -= 1;
-        sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
-    }
-
-    // Now the mod/rm byte.  The instruction we're generating doesn't access
-    // memory, so there is no SIB byte or immediate -- we're done.
-    sink.put1(encode_modrm(3, enc_g & 7, enc_e & 7));
-}
-
-// These are merely wrappers for the above two functions that facilitate passing
-// actual `Reg`s rather than their encodings.
-
-fn emit_std_reg_mem(
-    sink: &mut MachBuffer<Inst>,
-    state: &EmitState,
-    info: &EmitInfo,
-    prefixes: LegacyPrefixes,
-    opcodes: u32,
-    num_opcodes: usize,
-    reg_g: Reg,
-    mem_e: &Amode,
-    rex: RexFlags,
-) {
-    let enc_g = reg_enc(reg_g);
-    emit_std_enc_mem(
-        sink,
-        state,
-        info,
-        prefixes,
-        opcodes,
-        num_opcodes,
-        enc_g,
-        mem_e,
-        rex,
-    );
-}
-
-fn emit_std_reg_reg(
-    sink: &mut MachBuffer<Inst>,
-    prefixes: LegacyPrefixes,
-    opcodes: u32,
-    num_opcodes: usize,
-    reg_g: Reg,
-    reg_e: Reg,
-    rex: RexFlags,
-) {
-    let enc_g = reg_enc(reg_g);
-    let enc_e = reg_enc(reg_e);
-    emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex);
-}
-
-/// Write a suitable number of bits from an imm64 to the sink.
-fn emit_simm(sink: &mut MachBuffer<Inst>, size: u8, simm32: u32) {
-    match size {
-        8 | 4 => sink.put4(simm32),
-        2 => sink.put2(simm32 as u16),
-        1 => sink.put1(simm32 as u8),
-        _ => unreachable!(),
-    }
-}
+use regalloc::{Reg, Writable};

 /// A small helper to generate a signed conversion instruction.
 fn emit_signed_cvt(
@@ -546,18 +115,30 @@ pub(crate) fn emit(
    info: &EmitInfo,
    state: &mut EmitState,
 ) {
-    if let Some(iset_requirement) = inst.isa_requirement() {
+    let matches_isa_flags = |iset_requirement: &InstructionSet| -> bool {
        match iset_requirement {
            // Cranelift assumes SSE2 at least.
-            InstructionSet::SSE | InstructionSet::SSE2 => {}
-            InstructionSet::SSSE3 => assert!(info.isa_flags.use_ssse3()),
-            InstructionSet::SSE41 => assert!(info.isa_flags.use_sse41()),
-            InstructionSet::SSE42 => assert!(info.isa_flags.use_sse42()),
-            InstructionSet::Popcnt => assert!(info.isa_flags.use_popcnt()),
-            InstructionSet::Lzcnt => assert!(info.isa_flags.use_lzcnt()),
-            InstructionSet::BMI1 => assert!(info.isa_flags.use_bmi1()),
-            InstructionSet::BMI2 => assert!(info.isa_flags.has_bmi2()),
+            InstructionSet::SSE | InstructionSet::SSE2 => true,
+            InstructionSet::SSSE3 => info.isa_flags.use_ssse3(),
+            InstructionSet::SSE41 => info.isa_flags.use_sse41(),
+            InstructionSet::SSE42 => info.isa_flags.use_sse42(),
+            InstructionSet::Popcnt => info.isa_flags.use_popcnt(),
+            InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
+            InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
+            InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
+            InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
+            InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
        }
+    };
+
+    // Certain instructions may be present in more than one ISA feature set; we must at least match
+    // one of them in the target CPU.
+    let isa_requirements = inst.available_in_any_isa();
+    if !isa_requirements.is_empty() && !isa_requirements.iter().any(matches_isa_flags) {
+        panic!(
+            "Cannot emit inst '{:?}' for target; failed to match ISA requirements: {:?}",
+            inst, isa_requirements
+        )
    }

    match inst {
@@ -887,7 +468,6 @@ pub(crate) fn emit(
            // idiv %divisor
            //
            // $done:
-            debug_assert!(info.flags().avoid_div_traps());

            // Check if the divisor is zero, first.
            let inst = Inst::cmp_rmi_r(*size, RegMemImm::imm(0), divisor.to_reg());
@@ -911,7 +491,7 @@ pub(crate) fn emit(
                    // x % -1 = 0; put the result into the destination, $rdx.
                    let done_label = sink.get_label();

-                    let inst = Inst::imm(*size, 0, Writable::from_reg(regs::rdx()));
+                    let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(regs::rdx()));
                    inst.emit(sink, info, state);

                    let inst = Inst::jmp_known(done_label);
@@ -951,11 +531,6 @@ pub(crate) fn emit(
                sink.bind_label(do_op);
            }

-            assert!(
-                *size != OperandSize::Size8,
-                "CheckedDivOrRemSeq for i8 is not yet implemented"
-            );
-
            // Fill in the high parts:
            if kind.is_signed() {
                // sign-extend the sign-bit of rax into rdx, for signed opcodes.
@@ -1769,6 +1344,7 @@ pub(crate) fn emit(
            let rex = RexFlags::clear_w();

            let (prefix, opcode, num_opcodes) = match op {
+                SseOpcode::Cvtdq2pd => (LegacyPrefixes::_F3, 0x0FE6, 2),
                SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
                SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
                SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
@@ -1830,6 +1406,24 @@ pub(crate) fn emit(
            };
        }

+        Inst::XmmUnaryRmREvex { op, src, dst } => {
+            let opcode = match op {
+                Avx512Opcode::Vpabsq => 0x1f,
+            };
+            match src {
+                RegMem::Reg { reg: src } => EvexInstruction::new()
+                    .length(EvexVectorLength::V128)
+                    .prefix(LegacyPrefixes::_66)
+                    .map(OpcodeMap::_0F38)
+                    .w(true)
+                    .opcode(opcode)
+                    .reg(dst.to_reg().get_hw_encoding())
+                    .rm(src.get_hw_encoding())
+                    .encode(sink),
+                _ => todo!(),
+            };
+        }
+
        Inst::XmmRmR {
            op,
            src: src_e,
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3859,6 +3859,18 @@ fn test_x64_emit() {
        "pabsd   %xmm10, %xmm11",
    ));

+    insns.push((
+        Inst::xmm_unary_rm_r(SseOpcode::Cvtdq2pd, RegMem::reg(xmm2), w_xmm8),
+        "F3440FE6C2",
+        "cvtdq2pd %xmm2, %xmm8",
+    ));
+
+    insns.push((
+        Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, RegMem::reg(xmm2), w_xmm8),
+        "6272FD081FC2",
+        "vpabsq  %xmm2, %xmm8",
+    ));
+
    // Xmm to int conversions, and conversely.

    insns.push((
@@ -4270,6 +4282,7 @@ fn test_x64_emit() {
    let mut isa_flag_builder = x64::settings::builder();
    isa_flag_builder.enable("has_ssse3").unwrap();
    isa_flag_builder.enable("has_sse41").unwrap();
+    isa_flag_builder.enable("has_avx512f").unwrap();
    let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);

    let rru = regs::create_reg_universe_systemv(&flags);
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -14,7 +14,7 @@ use regalloc::{
    PrettyPrint, PrettyPrintSized, RealRegUniverse, Reg, RegClass, RegUsageCollector,
    RegUsageMapper, SpillSlot, VirtualReg, Writable,
 };
-use smallvec::SmallVec;
+use smallvec::{smallvec, SmallVec};
 use std::fmt;
 use std::string::{String, ToString};

@@ -224,6 +224,12 @@ pub enum Inst {
        dst: Writable<Reg>,
    },

+    XmmUnaryRmREvex {
+        op: Avx512Opcode,
+        src: RegMem,
+        dst: Writable<Reg>,
+    },
+
    /// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq
    XmmMovRM {
        op: SseOpcode,
@@ -501,7 +507,11 @@ pub(crate) fn low32_will_sign_extend_to_64(x: u64) -> bool {
 }

 impl Inst {
-    fn isa_requirement(&self) -> Option<InstructionSet> {
+    /// Retrieve a list of ISA feature sets in which the instruction is available. An empty list
+    /// indicates that the instruction is available in the baseline feature set (i.e. SSE2 and
+    /// below); more than one `InstructionSet` in the list indicates that the instruction is present
+    /// in *any* of the included ISA feature sets.
+    fn available_in_any_isa(&self) -> SmallVec<[InstructionSet; 2]> {
        match self {
            // These instructions are part of SSE2, which is a basic requirement in Cranelift, and
            // don't have to be checked.
@@ -554,7 +564,7 @@ impl Inst {
            | Inst::ElfTlsGetAddr { .. }
            | Inst::MachOTlsGetAddr { .. }
            | Inst::ValueLabelMarker { .. }
-            | Inst::Unwind { .. } => None,
+            | Inst::Unwind { .. } => smallvec![],

            Inst::UnaryRmR { op, .. } => op.available_from(),

@@ -565,7 +575,9 @@ impl Inst {
            | Inst::XmmRmR { op, .. }
            | Inst::XmmRmRImm { op, .. }
            | Inst::XmmToGpr { op, .. }
-            | Inst::XmmUnaryRmR { op, .. } => Some(op.available_from()),
+            | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
+
+            Inst::XmmUnaryRmREvex { op, .. } => op.available_from(),
        }
    }
 }
@@ -700,6 +712,12 @@ impl Inst {
        Inst::XmmUnaryRmR { op, src, dst }
    }

+    pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+        src.assert_regclass_is(RegClass::V128); // source operand is expected to be in class V128
+        debug_assert!(dst.to_reg().get_class() == RegClass::V128); // checked in debug builds only
+        Inst::XmmUnaryRmREvex { op, src, dst } // builds the variant; nothing is encoded here
+    }
+
    pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
        src.assert_regclass_is(RegClass::V128);
        debug_assert!(dst.to_reg().get_class() == RegClass::V128);
@@ -1121,11 +1139,7 @@ impl Inst {
    pub(crate) fn store(ty: Type, from_reg: Reg, to_addr: impl Into<SyntheticAmode>) -> Inst {
        let rc = from_reg.get_class();
        match rc {
-            RegClass::I64 => {
-                // Always store the full register, to ensure that the high bits are properly set
-                // when doing a full reload.
-                Inst::mov_r_m(OperandSize::Size64, from_reg, to_addr)
-            }
+            RegClass::I64 => Inst::mov_r_m(OperandSize::from_ty(ty), from_reg, to_addr),
            RegClass::V128 => {
                let opcode = match ty {
                    types::F32 => SseOpcode::Movss,
@@ -1390,6 +1404,13 @@ impl PrettyPrint for Inst {
                show_ireg_sized(dst.to_reg(), mb_rru, 8),
            ),

+            Inst::XmmUnaryRmREvex { op, src, dst, .. } => format!(
+                "{} {}, {}",
+                ljustify(op.to_string()),
+                src.show_rru_sized(mb_rru, 8),
+                show_ireg_sized(dst.to_reg(), mb_rru, 8),
+            ),
+
            Inst::XmmMovRM { op, src, dst, .. } => format!(
                "{} {}, {}",
                ljustify(op.to_string()),
@@ -1862,7 +1883,9 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                collector.add_def(Writable::from_reg(regs::rdx()));
            }
        },
-        Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. } => {
+        Inst::UnaryRmR { src, dst, .. }
+        | Inst::XmmUnaryRmR { src, dst, .. }
+        | Inst::XmmUnaryRmREvex { src, dst, .. } => {
            src.get_regs_as_uses(collector);
            collector.add_def(*dst);
        }
@@ -2209,6 +2232,11 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
            ref mut dst,
            ..
        }
+        | Inst::XmmUnaryRmREvex {
+            ref mut src,
+            ref mut dst,
+            ..
+        }
        | Inst::UnaryRmR {
            ref mut src,
            ref mut dst,
@@ -2827,7 +2855,7 @@ impl EmitState {
        self.stack_map = None;
    }

-    fn cur_srcloc(&self) -> SourceLoc {
+    pub(crate) fn cur_srcloc(&self) -> SourceLoc {
        self.cur_srcloc
    }
 }
--- a/cranelift/codegen/src/isa/x64/inst/unwind/systemv.rs
+++ b/cranelift/codegen/src/isa/x64/inst/unwind/systemv.rs
@@ -89,8 +89,8 @@ impl crate::isa::unwind::systemv::RegisterMapper<Reg> for RegisterMapper {
    fn sp(&self) -> u16 {
        X86_64::RSP.0
    }
-    fn fp(&self) -> u16 {
-        X86_64::RBP.0
+    fn fp(&self) -> Option<u16> {
+        Some(X86_64::RBP.0)
    }
 }

@@ -109,6 +109,7 @@ mod tests {
    use target_lexicon::triple;

    #[test]
+    #[cfg_attr(feature = "old-x86-backend", ignore)]
    fn test_simple_func() {
        let isa = lookup(triple!("x86_64"))
            .expect("expect x86 ISA")
@@ -151,6 +152,7 @@ mod tests {
    }

    #[test]
+    #[cfg_attr(feature = "old-x86-backend", ignore)]
    fn test_multi_return_func() {
        let isa = lookup(triple!("x86_64"))
            .expect("expect x86 ISA")
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -204,6 +204,7 @@ enum ExtSpec {
    ZeroExtendTo32,
    ZeroExtendTo64,
    SignExtendTo32,
+    #[allow(dead_code)] // not used just yet but may be used in the future!
    SignExtendTo64,
 }

@@ -1854,10 +1855,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
            if ty == types::I64X2 {
-                // This lowering could be a single instruction with AVX512F/VL's VPABSQ instruction.
-                // Instead, we use a separate register, `tmp`, to contain the results of `0 - src`
-                // and then blend in those results with `BLENDVPD` if the MSB of `tmp` was set to 1
-                // (i.e. if `tmp` was negative or, conversely, if `src` was originally positive).
+                if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() {
+                    ctx.emit(Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, src, dst));
+                } else {
+                    // If `VPABSQ` from AVX512 is unavailable, we use a separate register, `tmp`, to
+                    // contain the results of `0 - src` and then blend in those results with
+                    // `BLENDVPD` if the MSB of `tmp` was set to 1 (i.e. if `tmp` was negative or,
+                    // conversely, if `src` was originally positive).

                    // Emit all 0s into the `tmp` register.
                    let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
@@ -1873,6 +1877,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                        ty,
                    ));
                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
+                }
            } else if ty.is_vector() {
                let opcode = match ty {
                    types::I8X16 => SseOpcode::Pabsb,
@@ -2041,7 +2046,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                }
                ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
            } else if dst_ty == types::I128 {
-                let amt_src = put_input_in_reg(ctx, inputs[1]);
+                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
                let src = put_input_in_regs(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);

@@ -3914,7 +3919,15 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                ctx.emit(Inst::xmm_rm_r(opcode, RegMem::from(dst), dst));
            }
        }
-
+        Opcode::FcvtLowFromSint => {
+            let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
+            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            ctx.emit(Inst::xmm_unary_rm_r(
+                SseOpcode::Cvtdq2pd,
+                RegMem::from(src),
+                dst,
+            ));
+        }
        Opcode::FcvtFromUint => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ty.unwrap();
@@ -4813,28 +4826,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(

            if elem_ty == types::I128 {
                let srcs = put_input_in_regs(ctx, inputs[0]);
-                ctx.emit(Inst::mov_r_m(
-                    OperandSize::Size64,
-                    srcs.regs()[0],
-                    addr.clone(),
-                ));
-                ctx.emit(Inst::mov_r_m(
-                    OperandSize::Size64,
-                    srcs.regs()[1],
-                    addr.offset(8),
-                ));
+                ctx.emit(Inst::store(types::I64, srcs.regs()[0], addr.clone()));
+                ctx.emit(Inst::store(types::I64, srcs.regs()[1], addr.offset(8)));
            } else {
                let src = put_input_in_reg(ctx, inputs[0]);
-
-                ctx.emit(match elem_ty {
-                    types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr),
-                    types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr),
-                    _ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
-                        // TODO Specialize for different types: MOVUPD, MOVDQU, etc.
-                        Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr)
-                    }
-                    _ => Inst::mov_r_m(OperandSize::from_ty(elem_ty), src, addr),
-                });
+                ctx.emit(Inst::store(elem_ty, src, addr));
            }
        }

@@ -4938,7 +4934,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let ty_access = ctx.input_ty(insn, 0);
            assert!(is_valid_atomic_transaction_ty(ty_access));

-            ctx.emit(Inst::mov_r_m(OperandSize::from_ty(ty_access), data, addr));
+            ctx.emit(Inst::store(ty_access, data, addr));
            ctx.emit(Inst::Fence {
                kind: FenceKind::MFence,
            });
@@ -5181,7 +5177,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                input_ty,
            ));

-            if flags.avoid_div_traps() {
+            // Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly.
+            if flags.avoid_div_traps() || op == Opcode::Srem {
                // A vcode meta-instruction is used to lower the inline checks, since they embed
                // pc-relative offsets that must not change, thus requiring regalloc to not
                // interfere by introducing spills and reloads.
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -9,7 +9,7 @@ use crate::isa::Builder as IsaBuilder;
 use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
 use crate::result::CodegenResult;
 use crate::settings::{self as shared_settings, Flags};
-use alloc::boxed::Box;
+use alloc::{boxed::Box, vec::Vec};
 use core::hash::{Hash, Hasher};
 use regalloc::{PrettyPrint, RealRegUniverse, Reg};
 use target_lexicon::Triple;
@@ -18,6 +18,7 @@ use target_lexicon::Triple;
 use crate::isa::unwind::systemv;

 mod abi;
+pub mod encoding;
 mod inst;
 mod lower;
 mod settings;
@@ -85,6 +86,10 @@ impl MachBackend for X64Backend {
        &self.flags
    }

+    fn isa_flags(&self) -> Vec<shared_settings::Value> {
+        self.x64_flags.iter().collect()
+    }
+
    fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) {
        self.flags.hash(&mut hasher);
        self.x64_flags.hash(&mut hasher);
--- a/cranelift/codegen/src/isa/x64/settings.rs
+++ b/cranelift/codegen/src/isa/x64/settings.rs
@@ -1,6 +1,6 @@
 //! x86 Settings.

-use crate::settings::{self, detail, Builder};
+use crate::settings::{self, detail, Builder, Value};
 use core::fmt;

 // Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a
--- a/cranelift/codegen/src/isa/x86/abi.rs
+++ b/cranelift/codegen/src/isa/x86/abi.rs
@@ -503,15 +503,18 @@ fn callee_saved_regs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterS
 pub fn prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> CodegenResult<()> {
    match func.signature.call_conv {
        // For now, just translate fast and cold as system_v.
-        CallConv::Fast | CallConv::Cold | CallConv::SystemV => {
+        CallConv::Fast | CallConv::Cold | CallConv::SystemV | CallConv::WasmtimeSystemV => {
            system_v_prologue_epilogue(func, isa)
        }
-        CallConv::WindowsFastcall => fastcall_prologue_epilogue(func, isa),
+        CallConv::WindowsFastcall | CallConv::WasmtimeFastcall => {
+            fastcall_prologue_epilogue(func, isa)
+        }
        CallConv::BaldrdashSystemV | CallConv::BaldrdashWindows => {
            baldrdash_prologue_epilogue(func, isa)
        }
        CallConv::Probestack => unimplemented!("probestack calling convention"),
        CallConv::Baldrdash2020 => unimplemented!("Baldrdash ABI 2020"),
+        CallConv::AppleAarch64 => unreachable!("Apple AArch64 calling convention on x86"),
    }
 }

@@ -1083,16 +1086,17 @@ pub fn create_unwind_info(
    isa: &dyn TargetIsa,
 ) -> CodegenResult<Option<crate::isa::unwind::UnwindInfo>> {
    use crate::isa::unwind::UnwindInfo;
+    use crate::machinst::UnwindInfoKind;

    // Assumption: RBP is being used as the frame pointer for both calling conventions
    // In the future, we should be omitting frame pointer as an optimization, so this will change
-    Ok(match func.signature.call_conv {
-        CallConv::Fast | CallConv::Cold | CallConv::SystemV => {
+    Ok(match isa.unwind_info_kind() {
+        UnwindInfoKind::SystemV => {
            super::unwind::systemv::create_unwind_info(func, isa)?.map(|u| UnwindInfo::SystemV(u))
        }
-        CallConv::WindowsFastcall => {
+        UnwindInfoKind::Windows => {
            super::unwind::winx64::create_unwind_info(func, isa)?.map(|u| UnwindInfo::WindowsX64(u))
        }
-        _ => None,
+        UnwindInfoKind::None => None,
    })
 }
--- a/cranelift/codegen/src/isa/x86/mod.rs
+++ b/cranelift/codegen/src/isa/x86/mod.rs
@@ -21,8 +21,7 @@ use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa};
 use crate::regalloc;
 use crate::result::CodegenResult;
 use crate::timing;
-use alloc::borrow::Cow;
-use alloc::boxed::Box;
+use alloc::{borrow::Cow, boxed::Box, vec::Vec};
 use core::any::Any;
 use core::fmt;
 use core::hash::{Hash, Hasher};
@@ -79,6 +78,10 @@ impl TargetIsa for Isa {
        &self.shared_flags
    }

+    fn isa_flags(&self) -> Vec<shared_settings::Value> {
+        self.isa_flags.iter().collect()
+    }
+
    fn hash_all_flags(&self, mut hasher: &mut dyn Hasher) {
        self.shared_flags.hash(&mut hasher);
        self.isa_flags.hash(&mut hasher);
--- a/cranelift/codegen/src/isa/x86/settings.rs
+++ b/cranelift/codegen/src/isa/x86/settings.rs
@@ -1,6 +1,6 @@
 //! x86 Settings.

-use crate::settings::{self, detail, Builder};
+use crate::settings::{self, detail, Builder, Value};
 use core::fmt;

 // Include code generated by `cranelift-codegen/meta/src/gen_settings.rs:`. This file contains a
--- a/cranelift/codegen/src/isa/x86/unwind/systemv.rs
+++ b/cranelift/codegen/src/isa/x86/unwind/systemv.rs
@@ -3,7 +3,7 @@
 use crate::ir::Function;
 use crate::isa::{
    unwind::systemv::{RegisterMappingError, UnwindInfo},
-    CallConv, RegUnit, TargetIsa,
+    RegUnit, TargetIsa,
 };
 use crate::result::CodegenResult;
 use gimli::{write::CommonInformationEntry, Encoding, Format, Register, X86_64};
@@ -97,8 +97,8 @@ pub(crate) fn create_unwind_info(
    isa: &dyn TargetIsa,
 ) -> CodegenResult<Option<UnwindInfo>> {
    // Only System V-like calling conventions are supported
-    match func.signature.call_conv {
-        CallConv::Fast | CallConv::Cold | CallConv::SystemV => {}
+    match isa.unwind_info_kind() {
+        crate::machinst::UnwindInfoKind::SystemV => {}
        _ => return Ok(None),
    }

@@ -121,8 +121,8 @@ pub(crate) fn create_unwind_info(
        fn sp(&self) -> u16 {
            X86_64::RSP.0
        }
-        fn fp(&self) -> u16 {
-            X86_64::RBP.0
+        fn fp(&self) -> Option<u16> {
+            Some(X86_64::RBP.0)
        }
    }
    let map = RegisterMapper(isa);
--- a/cranelift/codegen/src/isa/x86/unwind/winx64.rs
+++ b/cranelift/codegen/src/isa/x86/unwind/winx64.rs
@@ -2,7 +2,7 @@

 use crate::ir::Function;
 use crate::isa::x86::registers::{FPR, GPR};
-use crate::isa::{unwind::winx64::UnwindInfo, CallConv, RegUnit, TargetIsa};
+use crate::isa::{unwind::winx64::UnwindInfo, RegUnit, TargetIsa};
 use crate::result::CodegenResult;

 pub(crate) fn create_unwind_info(
@@ -10,7 +10,7 @@ pub(crate) fn create_unwind_info(
    isa: &dyn TargetIsa,
 ) -> CodegenResult<Option<UnwindInfo>> {
    // Only Windows fastcall is supported for unwind information
-    if func.signature.call_conv != CallConv::WindowsFastcall || func.prologue_end.is_none() {
+    if !func.signature.call_conv.extends_windows_fastcall() || func.prologue_end.is_none() {
        return Ok(None);
    }

--- a/cranelift/codegen/src/lib.rs
+++ b/cranelift/codegen/src/lib.rs
@@ -97,6 +97,7 @@ mod inst_predicates;
 mod iterators;
 mod legalizer;
 mod licm;
+mod log;
 mod nan_canonicalization;
 mod partition_slice;
 mod postopt;
--- a/cranelift/codegen/src/log.rs
+++ b/cranelift/codegen/src/log.rs
@@ -0,0 +1,45 @@
+//! This module implements deferred display helpers.
+//!
+//! These are particularly useful in logging contexts, where the maximum logging level filter might
+//! be enabled, but we don't want the arguments to be evaluated early:
+//!
+//! ```
+//! log::set_max_level(log::LevelFilter::max());
+//! fn expensive_calculation() -> String {
+//!   "a string that is very slow to generate".into()
+//! }
+//! log::debug!("{}", expensive_calculation());
+//! ```
+//!
+//! If the associated log implementation filters out log debug entries, the expensive calculation
+//! would have been wasted. In this case, we can wrap the expensive computation within a
+//! `DeferredDisplay`, so that the computation only happens when the actual `fmt` function is
+//! called.
+
+use core::fmt;
+
+/// Wraps a closure so that the value it produces is only computed when the wrapper is formatted.
+pub(crate) struct DeferredDisplay<F>(F);
+
+impl<F: Fn() -> T, T: fmt::Display> DeferredDisplay<F> {
+    /// Creates a wrapper around `f`; `f` is not called here.
+    pub(crate) fn new(f: F) -> Self {
+        DeferredDisplay(f)
+    }
+}
+
+impl<F: Fn() -> T, T: fmt::Display> fmt::Display for DeferredDisplay<F> {
+    /// Calls the wrapped closure and formats its result with `Display`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let rendered = (self.0)();
+        fmt::Display::fmt(&rendered, f)
+    }
+}
+
+impl<F: Fn() -> T, T: fmt::Debug> fmt::Debug for DeferredDisplay<F> {
+    /// Calls the wrapped closure and formats its result with `Debug`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let rendered = (self.0)();
+        fmt::Debug::fmt(&rendered, f)
+    }
+}
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -30,6 +30,12 @@ pub trait ABICallee {
    /// Access the (possibly legalized) signature.
    fn signature(&self) -> &Signature;

+    /// Accumulate outgoing arguments.  This ensures that at least SIZE bytes
+    /// are allocated in the prologue to be available for use in function calls
+    /// to hold arguments and/or return values.  If this function is called
+    /// multiple times, the maximum of all SIZE values will be available.
+    fn accumulate_outgoing_args_size(&mut self, size: u32);
+
    /// Get the settings controlling this function's compilation.
    fn flags(&self) -> &settings::Flags;

@@ -189,9 +195,6 @@ pub trait ABICallee {
        from_slot: SpillSlot,
        ty: Option<Type>,
    ) -> Self::I;
-
-    /// Desired unwind info type.
-    fn unwind_info_kind(&self) -> UnwindInfoKind;
 }

 /// Trait implemented by an object that tracks ABI-related state and can
@@ -245,6 +248,13 @@ pub trait ABICaller {
    /// Emit code to post-adjust the satck, after call return and return-value copies.
    fn emit_stack_post_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C);

+    /// Accumulate outgoing arguments.  This ensures that the caller (as
+    /// identified via the CTX argument) allocates enough space in the
+    /// prologue to hold all arguments and return values for this call.
+    /// There is no code emitted at the call site, everything is done
+    /// in the caller's function prologue.
+    fn accumulate_outgoing_args_size<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C);
+
    /// Emit the call itself.
    ///
    /// The returned instruction should have proper use- and def-sets according
--- a/cranelift/codegen/src/machinst/abi_impl.rs
+++ b/cranelift/codegen/src/machinst/abi_impl.rs
@@ -102,7 +102,7 @@
 //! support the SpiderMonkey Wasm ABI.  For details of the multi-value return
 //! ABI, see:
 //!
-//! https://searchfox.org/mozilla-central/rev/bc3600def806859c31b2c7ac06e3d69271052a89/js/src/wasm/WasmStubs.h#134
+//! <https://searchfox.org/mozilla-central/rev/bc3600def806859c31b2c7ac06e3d69271052a89/js/src/wasm/WasmStubs.h#134>
 //!
 //! In brief:
 //! - Return values are processed in *reverse* order.
@@ -444,6 +444,7 @@ pub trait ABIMachineSpec {
        flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        fixed_frame_storage_size: u32,
+        outgoing_args_size: u32,
    ) -> (u64, SmallVec<[Self::I; 16]>);

    /// Generate a clobber-restore sequence. This sequence should perform the
@@ -455,6 +456,7 @@ pub trait ABIMachineSpec {
        flags: &settings::Flags,
        clobbers: &Set<Writable<RealReg>>,
        fixed_frame_storage_size: u32,
+        outgoing_args_size: u32,
    ) -> SmallVec<[Self::I; 16]>;

    /// Generate a call instruction/sequence. This method is provided one
@@ -576,6 +578,8 @@ pub struct ABICalleeImpl<M: ABIMachineSpec> {
    stackslots: PrimaryMap<StackSlot, u32>,
    /// Total stack size of all stackslots.
    stackslots_size: u32,
+    /// Stack size to be reserved for outgoing arguments.
+    outgoing_args_size: u32,
    /// Clobbered registers, from regalloc.
    clobbered: Set<Writable<RealReg>>,
    /// Total number of spillslots, from regalloc.
@@ -646,7 +650,9 @@ impl<M: ABIMachineSpec> ABICalleeImpl<M> {
                || call_conv == isa::CallConv::Fast
                || call_conv == isa::CallConv::Cold
                || call_conv.extends_baldrdash()
-                || call_conv.extends_windows_fastcall(),
+                || call_conv.extends_windows_fastcall()
+                || call_conv == isa::CallConv::AppleAarch64
+                || call_conv == isa::CallConv::WasmtimeSystemV,
            "Unsupported calling convention: {:?}",
            call_conv
        );
@@ -689,6 +695,7 @@ impl<M: ABIMachineSpec> ABICalleeImpl<M> {
            sig,
            stackslots,
            stackslots_size: stack_offset,
+            outgoing_args_size: 0,
            clobbered: Set::empty(),
            spillslots: None,
            fixed_frame_storage_size: 0,
@@ -915,6 +922,12 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
        }
    }

+    fn accumulate_outgoing_args_size(&mut self, size: u32) {
+        // Keep a running maximum, so that the recorded size is large enough for
+        // the most demanding request seen so far.
+        self.outgoing_args_size = self.outgoing_args_size.max(size);
+    }
+
    fn flags(&self) -> &settings::Flags {
        &self.flags
    }
@@ -1196,6 +1209,15 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
        let spill_off = islot * M::word_bytes() as i64;
        let sp_off = self.stackslots_size as i64 + spill_off;
        trace!("load_spillslot: slot {:?} -> sp_off {}", slot, sp_off);
+
+        // Integer types smaller than word size have been spilled as words below,
+        // and therefore must be reloaded in the same type.
+        let ty = if ty.is_int() && ty.bytes() < M::word_bytes() {
+            M::word_type()
+        } else {
+            ty
+        };
+
        gen_load_stack_multi::<M>(StackAMode::NominalSPOffset(sp_off, ty), into_regs, ty)
    }

@@ -1211,6 +1233,19 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
        let spill_off = islot * M::word_bytes() as i64;
        let sp_off = self.stackslots_size as i64 + spill_off;
        trace!("store_spillslot: slot {:?} -> sp_off {}", slot, sp_off);
+
+        // When reloading from a spill slot, we might have lost information about real integer
+        // types. For instance, on the x64 backend, a zero-extension can become redundant and be
+        // optimized into a move, causing vregs of types I32 and I64 to share the same coalescing
+        // equivalence class. As a consequence, such a value can be spilled as an I32 and later
+        // reloaded as an I64; to make sure the high bits are always defined, always do a
+        // word-sized store in this case.
+        let ty = if ty.is_int() && ty.bytes() < M::word_bytes() {
+            M::word_type()
+        } else {
+            ty
+        };
+
        gen_store_stack_multi::<M>(StackAMode::NominalSPOffset(sp_off, ty), from_regs, ty)
    }

@@ -1283,11 +1318,12 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
        }

        // Save clobbered registers.
-        let (_, clobber_insts) = M::gen_clobber_save(
+        let (clobber_size, clobber_insts) = M::gen_clobber_save(
            self.call_conv,
            &self.flags,
            &self.clobbered,
            self.fixed_frame_storage_size,
+            self.outgoing_args_size,
        );
        insts.extend(clobber_insts);

@@ -1302,7 +1338,7 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
        // [crate::machinst::abi_impl](this module) for more details
        // on stackframe layout and nominal SP maintenance.

-        self.total_frame_size = Some(total_stacksize);
+        self.total_frame_size = Some(total_stacksize + clobber_size as u32);
        insts
    }

@@ -1315,6 +1351,7 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
            &self.flags,
            &self.clobbered,
            self.fixed_frame_storage_size,
+            self.outgoing_args_size,
        ));

        // N.B.: we do *not* emit a nominal SP adjustment here, because (i) there will be no
@@ -1369,18 +1406,6 @@ impl<M: ABIMachineSpec> ABICallee for ABICalleeImpl<M> {
        .next()
        .unwrap()
    }
-
-    fn unwind_info_kind(&self) -> UnwindInfoKind {
-        match self.sig.call_conv {
-            #[cfg(feature = "unwind")]
-            isa::CallConv::Fast | isa::CallConv::Cold | isa::CallConv::SystemV => {
-                UnwindInfoKind::SystemV
-            }
-            #[cfg(feature = "unwind")]
-            isa::CallConv::WindowsFastcall => UnwindInfoKind::Windows,
-            _ => UnwindInfoKind::None,
-        }
-    }
 }

 fn abisig_to_uses_and_defs<M: ABIMachineSpec>(sig: &ABISig) -> (Vec<Reg>, Vec<Writable<Reg>>) {
@@ -1529,6 +1554,11 @@ impl<M: ABIMachineSpec> ABICaller for ABICallerImpl<M> {
        }
    }

+    fn accumulate_outgoing_args_size<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) {
+        let off = self.sig.stack_arg_space + self.sig.stack_ret_space;
+        ctx.abi().accumulate_outgoing_args_size(off as u32);
+    }
+
    fn emit_stack_pre_adjust<C: LowerCtx<I = Self::I>>(&self, ctx: &mut C) {
        let off = self.sig.stack_arg_space + self.sig.stack_ret_space;
        adjust_stack_and_nominal_sp::<M, C>(ctx, off as i32, /* is_sub = */ true)
--- a/cranelift/codegen/src/machinst/adapter.rs
+++ b/cranelift/codegen/src/machinst/adapter.rs
@@ -2,10 +2,12 @@

 use crate::binemit;
 use crate::ir;
-use crate::isa::{EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa};
+use crate::isa::{
+    BackendVariant, EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa,
+};
 use crate::machinst::*;
 use crate::regalloc::RegisterSet;
-use crate::settings::Flags;
+use crate::settings::{self, Flags};

 #[cfg(feature = "testing_hooks")]
 use crate::regalloc::RegDiversions;
@@ -14,7 +16,6 @@ use crate::regalloc::RegDiversions;
 use crate::isa::unwind::systemv::RegisterMappingError;

 use core::any::Any;
-use core::hash::Hasher;
 use std::borrow::Cow;
 use std::fmt;
 use target_lexicon::Triple;
@@ -59,8 +60,16 @@ impl TargetIsa for TargetIsaAdapter {
        self.backend.flags()
    }

+    fn isa_flags(&self) -> Vec<settings::Value> {
+        self.backend.isa_flags()
+    }
+
+    fn variant(&self) -> BackendVariant {
+        BackendVariant::MachInst
+    }
+
    fn hash_all_flags(&self, hasher: &mut dyn Hasher) {
-        self.backend.hash_all_flags(hasher)
+        self.backend.hash_all_flags(hasher);
    }

    fn register_info(&self) -> RegInfo {
--- a/cranelift/codegen/src/machinst/compile.rs
+++ b/cranelift/codegen/src/machinst/compile.rs
@@ -1,6 +1,7 @@
 //! Compilation backend pipeline: optimized IR to VCode / binemit.

 use crate::ir::Function;
+use crate::log::DeferredDisplay;
 use crate::machinst::*;
 use crate::settings;
 use crate::timing;
@@ -29,9 +30,11 @@ where
        lower.lower(b)?
    };

+    // Creating the vcode string representation may be costly for large functions, so defer its
+    // rendering.
    debug!(
        "vcode from lowering: \n{}",
-        vcode.show_rru(Some(b.reg_universe()))
+        DeferredDisplay::new(|| vcode.show_rru(Some(b.reg_universe())))
    );

    // Perform register allocation.
@@ -103,7 +106,7 @@ where

    debug!(
        "vcode after regalloc: final version:\n{}",
-        vcode.show_rru(Some(b.reg_universe()))
+        DeferredDisplay::new(|| vcode.show_rru(Some(b.reg_universe())))
    );

    Ok(vcode)
--- a/cranelift/codegen/src/machinst/mod.rs
+++ b/cranelift/codegen/src/machinst/mod.rs
@@ -64,18 +64,18 @@ use crate::binemit::{CodeInfo, CodeOffset, StackMap};
 use crate::ir::condcodes::IntCC;
 use crate::ir::{Function, SourceLoc, StackSlot, Type, ValueLabel};
 use crate::result::CodegenResult;
-use crate::settings::Flags;
+use crate::settings::{self, Flags};
 use crate::value_label::ValueLabelsRanges;
 use alloc::boxed::Box;
 use alloc::vec::Vec;
 use core::fmt::Debug;
+use core::hash::Hasher;
 use cranelift_entity::PrimaryMap;
 use regalloc::RegUsageCollector;
 use regalloc::{
    RealReg, RealRegUniverse, Reg, RegClass, RegUsageMapper, SpillSlot, VirtualReg, Writable,
 };
 use smallvec::{smallvec, SmallVec};
-use std::hash::Hasher;
 use std::string::String;
 use target_lexicon::Triple;

@@ -368,8 +368,10 @@ pub trait MachBackend {
    /// Return flags for this backend.
    fn flags(&self) -> &Flags;

-    /// Hashes all flags, both ISA-independent and ISA-specific, into the
-    /// specified hasher.
+    /// Get the ISA-dependent flag values that were used to make this trait object.
+    fn isa_flags(&self) -> Vec<settings::Value>;
+
+    /// Hashes all flags, both ISA-independent and ISA-dependent, into the specified hasher.
    fn hash_all_flags(&self, hasher: &mut dyn Hasher);

    /// Return triple for this backend.
--- a/cranelift/codegen/src/regalloc/liverange.rs
+++ b/cranelift/codegen/src/regalloc/liverange.rs
@@ -66,7 +66,7 @@
 //! Our current implementation uses a sorted array of compressed intervals, represented by their
 //! boundaries (Block, Inst), sorted by Block. This is a simple data structure, enables coalescing of
 //! intervals easily, and shows some nice performance behavior. See
-//! https://github.com/bytecodealliance/cranelift/issues/1084 for benchmarks against using a
+//! <https://github.com/bytecodealliance/cranelift/issues/1084> for benchmarks against using a
 //! bforest::Map<Block, Inst>.
 //!
 //! ## block ordering
@@ -112,7 +112,7 @@
 //! the necessary API to make coalescing easy, nor does it optimize for our types' sizes.
 //!
 //! Even the specialized `bforest::Map<Block, Inst>` implementation is slower than a plain sorted
-//! array, see https://github.com/bytecodealliance/cranelift/issues/1084 for details.
+//! array, see <https://github.com/bytecodealliance/cranelift/issues/1084> for details.

 use crate::entity::SparseMapValue;
 use crate::ir::{Block, ExpandedProgramPoint, Inst, Layout, ProgramOrder, ProgramPoint, Value};
--- a/cranelift/codegen/src/result.rs
+++ b/cranelift/codegen/src/result.rs
@@ -2,19 +2,17 @@

 use crate::verifier::VerifierErrors;
 use std::string::String;
-use thiserror::Error;

 /// A compilation error.
 ///
 /// When Cranelift fails to compile a function, it will return one of these error codes.
-#[derive(Error, Debug, PartialEq, Eq)]
+#[derive(Debug, PartialEq, Eq)]
 pub enum CodegenError {
    /// A list of IR verifier errors.
    ///
    /// This always represents a bug, either in the code that generated IR for Cranelift, or a bug
    /// in Cranelift itself.
-    #[error("Verifier errors")]
-    Verifier(#[from] VerifierErrors),
+    Verifier(VerifierErrors),

    /// An implementation limit was exceeded.
    ///
@@ -22,27 +20,57 @@ pub enum CodegenError {
    /// limits][limits] that cause compilation to fail when they are exceeded.
    ///
    /// [limits]: https://github.com/bytecodealliance/wasmtime/blob/main/cranelift/docs/ir.md#implementation-limits
-    #[error("Implementation limit exceeded")]
    ImplLimitExceeded,

    /// The code size for the function is too large.
    ///
    /// Different target ISAs may impose a limit on the size of a compiled function. If that limit
    /// is exceeded, compilation fails.
-    #[error("Code for function is too large")]
    CodeTooLarge,

    /// Something is not supported by the code generator. This might be an indication that a
    /// feature is used without explicitly enabling it, or that something is temporarily
    /// unsupported by a given target backend.
-    #[error("Unsupported feature: {0}")]
    Unsupported(String),

    /// A failure to map Cranelift register representation to a DWARF register representation.
    #[cfg(feature = "unwind")]
-    #[error("Register mapping error")]
    RegisterMappingError(crate::isa::unwind::systemv::RegisterMappingError),
 }

 /// A convenient alias for a `Result` that uses `CodegenError` as the error type.
 pub type CodegenResult<T> = Result<T, CodegenError>;
+
+// This is manually implementing Error and Display instead of using thiserror to reduce the amount
+// of dependencies used by Cranelift.
+impl std::error::Error for CodegenError {
+    /// Returns the underlying cause of this error, if there is one.
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            // Only verifier failures expose their payload as the source.
+            Self::Verifier(errors) => Some(errors),
+            Self::ImplLimitExceeded | Self::CodeTooLarge | Self::Unsupported(_) => None,
+            #[cfg(feature = "unwind")]
+            Self::RegisterMappingError(_) => None,
+        }
+    }
+}
+
+impl std::fmt::Display for CodegenError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Self::Verifier(_) => f.write_str("Verifier errors"),
+            Self::ImplLimitExceeded => f.write_str("Implementation limit exceeded"),
+            Self::CodeTooLarge => f.write_str("Code for function is too large"),
+            Self::Unsupported(feature) => write!(f, "Unsupported feature: {}", feature),
+            #[cfg(feature = "unwind")]
+            Self::RegisterMappingError(_) => f.write_str("Register mapping error"),
+        }
+    }
+}
+
+impl From<VerifierErrors> for CodegenError {
+    fn from(source: VerifierErrors) -> Self {
+        Self::Verifier(source)
+    }
+}
--- a/cranelift/codegen/src/settings.rs
+++ b/cranelift/codegen/src/settings.rs
@@ -26,7 +26,6 @@ use alloc::boxed::Box;
 use alloc::string::{String, ToString};
 use core::fmt;
 use core::str;
-use thiserror::Error;

 /// A string-based configurator for settings groups.
 ///
@@ -44,6 +43,78 @@ pub trait Configurable {
    fn enable(&mut self, name: &str) -> SetResult<()>;
 }

+/// Represents the kind of setting.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum SettingKind {
+    /// The setting is an enumeration.
+    Enum,
+    /// The setting is a number.
+    Num,
+    /// The setting is a boolean.
+    Bool,
+    /// The setting is a preset.
+    Preset,
+}
+
+/// Represents an available builder setting.
+///
+/// This is used for iterating settings in a builder.
+#[derive(Clone, Copy, Debug)]
+pub struct Setting {
+    /// The name of the setting.
+    pub name: &'static str,
+    /// The description of the setting.
+    pub description: &'static str,
+    /// The kind of the setting.
+    pub kind: SettingKind,
+    /// The supported values of the setting (for enum values).
+    pub values: Option<&'static [&'static str]>,
+}
+
+/// Represents a setting value.
+///
+/// This is used for iterating values in `Flags`.
+pub struct Value {
+    /// The name of the setting associated with this value.
+    pub name: &'static str,
+    pub(crate) detail: detail::Detail,
+    pub(crate) values: Option<&'static [&'static str]>,
+    pub(crate) value: u8,
+}
+
+impl Value {
+    /// Gets the kind of setting. Panics if the value belongs to a preset.
+    pub fn kind(&self) -> SettingKind {
+        match self.detail {
+            detail::Detail::Bool { .. } => SettingKind::Bool,
+            detail::Detail::Num => SettingKind::Num,
+            detail::Detail::Enum { .. } => SettingKind::Enum,
+            detail::Detail::Preset => unreachable!(),
+        }
+    }
+
+    /// Gets the enumerator name selected by this value, if an enumerator list is attached.
+    pub fn as_enum(&self) -> Option<&'static str> {
+        self.values.map(|names| names[usize::from(self.value)])
+    }
+
+    /// Gets the numerical value if the value is from a num setting.
+    pub fn as_num(&self) -> Option<u8> {
+        match self.detail {
+            detail::Detail::Num => Some(self.value),
+            _ => None,
+        }
+    }
+
+    /// Gets the boolean value if the value is from a boolean setting.
+    pub fn as_bool(&self) -> Option<bool> {
+        match self.detail {
+            detail::Detail::Bool { bit } => Some((self.value >> bit) & 1 == 1),
+            _ => None,
+        }
+    }
+}
+
 /// Collect settings values based on a template.
 #[derive(Clone, Hash)]
 pub struct Builder {
@@ -66,6 +137,30 @@ impl Builder {
        self.bytes
    }

+    /// Iterates the available settings in the builder.
+    pub fn iter(&self) -> impl Iterator<Item = Setting> {
+        let template = self.template;
+
+        template.descriptors.iter().map(move |desc| {
+            // Only enum settings carry a list of supported values.
+            let (kind, values) = match desc.detail {
+                detail::Detail::Num => (SettingKind::Num, None),
+                detail::Detail::Bool { .. } => (SettingKind::Bool, None),
+                detail::Detail::Preset => (SettingKind::Preset, None),
+                detail::Detail::Enum { last, enumerators } => {
+                    (SettingKind::Enum, Some(template.enums(last, enumerators)))
+                }
+            };
+
+            Setting {
+                name: desc.name,
+                description: desc.description,
+                kind,
+                values,
+            }
+        })
+    }
+
    /// Set the value of a single bit.
    fn set_bit(&mut self, offset: usize, bit: u8, value: bool) {
        let byte = &mut self.bytes[offset];
@@ -165,21 +260,34 @@ impl Configurable for Builder {
 }

 /// An error produced when changing a setting.
-#[derive(Error, Debug, PartialEq, Eq)]
+#[derive(Debug, PartialEq, Eq)]
 pub enum SetError {
    /// No setting by this name exists.
-    #[error("No existing setting named '{0}'")]
    BadName(String),

    /// Type mismatch for setting (e.g., setting an enum setting as a bool).
-    #[error("Trying to set a setting with the wrong type")]
    BadType,

    /// This is not a valid value for this setting.
-    #[error("Unexpected value for a setting, expected {0}")]
    BadValue(String),
 }

+impl std::error::Error for SetError {}
+
+impl fmt::Display for SetError {
+    /// Writes the human-readable message for this error; the `BadName` and
+    /// `BadValue` payloads are interpolated into their messages.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::BadName(name) => write!(f, "No existing setting named '{}'", name),
+            Self::BadType => f.write_str("Trying to set a setting with the wrong type"),
+            Self::BadValue(expected) => {
+                write!(f, "Unexpected value for a setting, expected {}", expected)
+            }
+        }
+    }
+}
+
 /// A result returned when changing a setting.
 pub type SetResult<T> = Result<T, SetError>;

@@ -288,6 +396,9 @@ pub mod detail {
        /// Lower snake-case name of setting as defined in meta.
        pub name: &'static str,

+        /// The description of the setting.
+        pub description: &'static str,
+
        /// Offset of byte containing this setting.
        pub offset: u32,

--- a/cranelift/codegen/src/verifier/mod.rs
+++ b/cranelift/codegen/src/verifier/mod.rs
@@ -80,7 +80,6 @@ use alloc::vec::Vec;
 use core::cmp::Ordering;
 use core::fmt::{self, Display, Formatter, Write};
 use log::debug;
-use thiserror::Error;

 pub use self::cssa::verify_cssa;
 pub use self::liveness::verify_liveness;
@@ -92,8 +91,7 @@ mod liveness;
 mod locations;

 /// A verifier error.
-#[derive(Error, Debug, PartialEq, Eq, Clone)]
-#[error("{}{}: {}", .location, format_context(.context), .message)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub struct VerifierError {
    /// The entity causing the verifier error.
    pub location: AnyEntity,
@@ -104,11 +102,16 @@ pub struct VerifierError {
    pub message: String,
 }

-/// Helper for formatting Verifier::Error context.
-fn format_context(context: &Option<String>) -> String {
-    match context {
-        None => "".to_string(),
-        Some(c) => format!(" ({})", c),
+// This is manually implementing Error and Display instead of using thiserror to reduce the number
+// of dependencies used by Cranelift.
+impl std::error::Error for VerifierError {}
+
+impl Display for VerifierError {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        match &self.context {
+            None => write!(f, "{}: {}", self.location, self.message),
+            Some(context) => write!(f, "{} ({}): {}", self.location, context, self.message),
+        }
    }
 }

@@ -175,9 +178,13 @@ pub type VerifierStepResult<T> = Result<T, ()>;
 pub type VerifierResult<T> = Result<T, VerifierErrors>;

 /// List of verifier errors.
-#[derive(Error, Debug, Default, PartialEq, Eq, Clone)]
+#[derive(Debug, Default, PartialEq, Eq, Clone)]
 pub struct VerifierErrors(pub Vec<VerifierError>);

+// This is manually implementing Error and Display instead of using thiserror to reduce the number
+// of dependencies used by Cranelift.
+impl std::error::Error for VerifierErrors {}
+
 impl VerifierErrors {
    /// Return a new `VerifierErrors` struct.
    #[inline]
--- a/cranelift/entity/Cargo.toml
+++ b/cranelift/entity/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 authors = ["The Cranelift Project Developers"]
 name = "cranelift-entity"
-version = "0.72.0"
+version = "0.73.0"
 description = "Data structures using entity references as mapping keys"
 license = "Apache-2.0 WITH LLVM-exception"
 documentation = "https://docs.rs/cranelift-entity"
--- a/cranelift/entity/src/lib.rs
+++ b/cranelift/entity/src/lib.rs
@@ -70,21 +70,25 @@ macro_rules! entity_impl {
    // Basic traits.
    ($entity:ident) => {
        impl $crate::EntityRef for $entity {
+            #[inline]
            fn new(index: usize) -> Self {
                debug_assert!(index < ($crate::__core::u32::MAX as usize));
                $entity(index as u32)
            }

+            #[inline]
            fn index(self) -> usize {
                self.0 as usize
            }
        }

        impl $crate::packed_option::ReservedValue for $entity {
+            #[inline]
            fn reserved_value() -> $entity {
                $entity($crate::__core::u32::MAX)
            }

+            #[inline]
            fn is_reserved_value(&self) -> bool {
                self.0 == $crate::__core::u32::MAX
            }
@@ -93,6 +97,7 @@ macro_rules! entity_impl {
        impl $entity {
            /// Create a new instance from a `u32`.
            #[allow(dead_code)]
+            #[inline]
            pub fn from_u32(x: u32) -> Self {
                debug_assert!(x < $crate::__core::u32::MAX);
                $entity(x)
@@ -100,6 +105,7 @@ macro_rules! entity_impl {

            /// Return the underlying index value as a `u32`.
            #[allow(dead_code)]
+            #[inline]
            pub fn as_u32(self) -> u32 {
                self.0
            }
--- a/cranelift/entity/src/primary.rs
+++ b/cranelift/entity/src/primary.rs
@@ -148,6 +148,28 @@ where
    pub fn into_boxed_slice(self) -> BoxedSlice<K, V> {
        unsafe { BoxedSlice::<K, V>::from_raw(Box::<[V]>::into_raw(self.elems.into_boxed_slice())) }
    }
+
+    /// Binary-searches the values using a key extraction function.
+    ///
+    /// The values must already be sorted by the key that `f` extracts.
+    ///
+    /// On a match, returns `Ok(K)` holding the entity key of the matching
+    /// value.
+    ///
+    /// When several values match, any one of them may be returned.
+    ///
+    /// Without a match, returns `Err(K)` holding the entity key at which a
+    /// matching element could be inserted while keeping the values sorted.
+    pub fn binary_search_values_by_key<'a, B, F>(&'a self, b: &B, f: F) -> Result<K, K>
+    where
+        F: FnMut(&'a V) -> B,
+        B: Ord,
+    {
+        match self.elems.binary_search_by_key(b, f) {
+            Ok(found) => Ok(K::new(found)),
+            Err(insert_at) => Err(K::new(insert_at)),
+        }
+    }
 }

 impl<K, V> Default for PrimaryMap<K, V>
--- a/cranelift/filetests/Cargo.toml
+++ b/cranelift/filetests/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cranelift-filetests"
 authors = ["The Cranelift Project Developers"]
-version = "0.66.0"
+version = "0.73.0"
 description = "Test driver and implementations of the filetest commands"
 license = "Apache-2.0 WITH LLVM-exception"
 documentation = "https://docs.rs/cranelift-filetests"
@@ -10,24 +10,22 @@ publish = false
 edition = "2018"

 [dependencies]
-cranelift-codegen = { path = "../codegen", version = "0.72.0", features = ["testing_hooks"] }
-cranelift-frontend = { path = "../frontend", version = "0.72.0" }
-cranelift-interpreter = { path = "../interpreter", version = "0.72.0" }
-cranelift-native = { path = "../native", version = "0.72.0" }
-cranelift-reader = { path = "../reader", version = "0.72.0" }
-cranelift-preopt = { path = "../preopt", version = "0.72.0" }
-byteorder = { version = "1.3.2", default-features = false }
+cranelift-codegen = { path = "../codegen", version = "0.73.0", features = ["testing_hooks"] }
+cranelift-frontend = { path = "../frontend", version = "0.73.0" }
+cranelift-interpreter = { path = "../interpreter", version = "0.73.0" }
+cranelift-native = { path = "../native", version = "0.73.0" }
+cranelift-reader = { path = "../reader", version = "0.73.0" }
+cranelift-preopt = { path = "../preopt", version = "0.73.0" }
 file-per-thread-logger = "0.1.2"
 filecheck = "0.5.0"
-gimli = { version = "0.23.0", default-features = false, features = ["read"] }
+gimli = { version = "0.24.0", default-features = false, features = ["read"] }
 log = "0.4.6"
 memmap2 = "0.2.1"
 num_cpus = "1.8.0"
-target-lexicon = "0.11"
+target-lexicon = "0.12"
 thiserror = "1.0.15"
 anyhow = "1.0.32"

 [features]
 enable-peepmatic = []
 experimental_arm32 = []
-experimental_x64 = []
--- a/cranelift/filetests/filetests/isa/aarch64/prologue.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/prologue.clif
@@ -77,22 +77,72 @@ block0(v0: f64):

 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: str q8, [sp, #-16]!
-; nextln: str q9, [sp, #-16]!
-; nextln: str q10, [sp, #-16]!
-; nextln: str q11, [sp, #-16]!
-; nextln: str q12, [sp, #-16]!
-; nextln: str q13, [sp, #-16]!
-; nextln: str q14, [sp, #-16]!
-; nextln: str q15, [sp, #-16]!
+; nextln: stp d14, d15, [sp, #-16]!
+; nextln: stp d12, d13, [sp, #-16]!
+; nextln: stp d10, d11, [sp, #-16]!
+; nextln: stp d8, d9, [sp, #-16]!

-; check: ldr q15, [sp], #16
-; nextln: ldr q14, [sp], #16
-; nextln: ldr q13, [sp], #16
-; nextln: ldr q12, [sp], #16
-; nextln: ldr q11, [sp], #16
-; nextln: ldr q10, [sp], #16
-; nextln: ldr q9, [sp], #16
-; nextln: ldr q8, [sp], #16
+; check: ldp d8, d9, [sp], #16
+; nextln: ldp d10, d11, [sp], #16
+; nextln: ldp d12, d13, [sp], #16
+; nextln: ldp d14, d15, [sp], #16
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f2(i64) -> i64 {
+block0(v0: i64):
+    v1 = iadd.i64 v0, v0
+    v2 = iadd.i64 v0, v1
+    v3 = iadd.i64 v0, v2
+    v4 = iadd.i64 v0, v3
+    v5 = iadd.i64 v0, v4
+    v6 = iadd.i64 v0, v5
+    v7 = iadd.i64 v0, v6
+    v8 = iadd.i64 v0, v7
+    v9 = iadd.i64 v0, v8
+    v10 = iadd.i64 v0, v9
+    v11 = iadd.i64 v0, v10
+    v12 = iadd.i64 v0, v11
+    v13 = iadd.i64 v0, v12
+    v14 = iadd.i64 v0, v13
+    v15 = iadd.i64 v0, v14
+    v16 = iadd.i64 v0, v15
+    v17 = iadd.i64 v0, v16
+    v18 = iadd.i64 v0, v17
+
+    v19 = iadd.i64 v0, v1
+    v20 = iadd.i64 v2, v3
+    v21 = iadd.i64 v4, v5
+    v22 = iadd.i64 v6, v7
+    v23 = iadd.i64 v8, v9
+    v24 = iadd.i64 v10, v11
+    v25 = iadd.i64 v12, v13
+    v26 = iadd.i64 v14, v15
+    v27 = iadd.i64 v16, v17
+
+    v28 = iadd.i64 v18, v19
+    v29 = iadd.i64 v20, v21
+    v30 = iadd.i64 v22, v23
+    v31 = iadd.i64 v24, v25
+    v32 = iadd.i64 v26, v27
+
+    v33 = iadd.i64 v28, v29
+    v34 = iadd.i64 v30, v31
+
+    v35 = iadd.i64 v32, v33
+    v36 = iadd.i64 v34, v35
+
+    return v36
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: str x22, [sp, #-16]!
+; nextln: stp x19, x20, [sp, #-16]!
+; nextln: add x1, x0, x0
+
+; check: add x0, x1, x0
+; nextln: ldp x19, x20, [sp], #16
+; nextln: ldr x22, [sp], #16
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
--- a/cranelift/filetests/filetests/isa/s390x/arithmetic.clif
+++ b/cranelift/filetests/filetests/isa/s390x/arithmetic.clif
--- a/cranelift/filetests/filetests/isa/s390x/bitops.clif
+++ b/cranelift/filetests/filetests/isa/s390x/bitops.clif
@@ -0,0 +1,243 @@
+test compile
+target s390x
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BITREV
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; FIXME: bitrev not yet implemented
+
+;function %bitrev_i64(i64) -> i64 {
+;block0(v0: i64):
+;    v1 = bitrev v0
+;    return v1
+;}
+;
+;function %bitrev_i32(i32) -> i32 {
+;block0(v0: i32):
+;    v1 = bitrev v0
+;    return v1
+;}
+;
+;function %bitrev_i16(i16) -> i16 {
+;block0(v0: i16):
+;    v1 = bitrev v0
+;    return v1
+;}
+;
+;function %bitrev_i8(i8) -> i8 {
+;block0(v0: i8):
+;    v1 = bitrev v0
+;    return v1
+;}
+;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; CLZ
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %clz_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = clz v0
+    return v1
+}
+
+; check:  flogr %r0, %r2
+; nextln: lgr %r2, %r0
+; nextln: br %r14
+
+function %clz_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = clz v0
+    return v1
+}
+
+; check:  llgfr %r2, %r2
+; nextln: flogr %r0, %r2
+; nextln: lr %r2, %r0
+; nextln: ahi %r2, -32
+; nextln: br %r14
+
+function %clz_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = clz v0
+    return v1
+}
+
+; check:  llghr %r2, %r2
+; nextln: flogr %r0, %r2
+; nextln: lr %r2, %r0
+; nextln: ahi %r2, -48
+; nextln: br %r14
+
+function %clz_i8(i8) -> i8 {
+block0(v0: i8):
+    v1 = clz v0
+    return v1
+}
+
+; check:  llgcr %r2, %r2
+; nextln: flogr %r0, %r2
+; nextln: lr %r2, %r0
+; nextln: ahi %r2, -56
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; CLS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %cls_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = cls v0
+    return v1
+}
+
+; check:  srag %r3, %r2, 63
+; nextln: xgr %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: lgr %r2, %r0
+; nextln: br %r14
+
+function %cls_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = cls v0
+    return v1
+}
+
+; check:  lgfr %r2, %r2
+; nextln: srag %r3, %r2, 63
+; nextln: xgr %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: lr %r2, %r0
+; nextln: ahi %r2, -32
+; nextln: br %r14
+
+function %cls_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = cls v0
+    return v1
+}
+
+; check:  lghr %r2, %r2
+; nextln: srag %r3, %r2, 63
+; nextln: xgr %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: lr %r2, %r0
+; nextln: ahi %r2, -48
+; nextln: br %r14
+
+function %cls_i8(i8) -> i8 {
+block0(v0: i8):
+    v1 = cls v0
+    return v1
+}
+
+; check:  lgbr %r2, %r2
+; nextln: srag %r3, %r2, 63
+; nextln: xgr %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: lr %r2, %r0
+; nextln: ahi %r2, -56
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; CTZ
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %ctz_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = ctz v0
+    return v1
+}
+
+; check:  lcgr %r3, %r2
+; nextln: ngrk %r2, %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: locghie %r0, -1
+; nextln: lghi %r2, 63
+; nextln: sgr %r2, %r0
+; nextln: br %r14
+
+function %ctz_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = ctz v0
+    return v1
+}
+
+; check:  oihl %r2, 1
+; nextln: lcgr %r3, %r2
+; nextln: ngrk %r2, %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: lhi %r2, 63
+; nextln: sr %r2, %r0
+; nextln: br %r14
+
+function %ctz_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = ctz v0
+    return v1
+}
+
+; check:  oilh %r2, 1
+; nextln: lcgr %r3, %r2
+; nextln: ngrk %r2, %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: lhi %r2, 63
+; nextln: sr %r2, %r0
+; nextln: br %r14
+
+function %ctz_i8(i8) -> i8 {
+block0(v0: i8):
+    v1 = ctz v0
+    return v1
+}
+
+; check:  oill %r2, 256
+; nextln: lcgr %r3, %r2
+; nextln: ngrk %r2, %r3, %r2
+; nextln: flogr %r0, %r2
+; nextln: lhi %r2, 63
+; nextln: sr %r2, %r0
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; POPCNT
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %popcnt_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = popcnt v0
+    return v1
+}
+
+; check:  popcnt %r2, %r2, 8
+; nextln: br %r14
+
+function %popcnt_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = popcnt v0
+    return v1
+}
+
+; check:  llgfr %r2, %r2
+; nextln: popcnt %r2, %r2, 8
+; nextln: br %r14
+
+function %popcnt_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = popcnt v0
+    return v1
+}
+
+; check:  llghr %r2, %r2
+; nextln: popcnt %r2, %r2, 8
+; nextln: br %r14
+
+function %popcnt_i8(i8) -> i8 {
+block0(v0: i8):
+    v1 = popcnt v0
+    return v1
+}
+
+; check: popcnt %r2, %r2
+; nextln: br %r14
--- a/cranelift/filetests/filetests/isa/s390x/bitwise.clif
+++ b/cranelift/filetests/filetests/isa/s390x/bitwise.clif
@@ -0,0 +1,490 @@
+
+test compile
+target s390x
+
+; FIXME: add immediate operand versions
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BAND
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %band_i64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = band.i64 v0, v1
+  return v2
+}
+
+; check:  ngr %r2, %r3
+; nextln: br %r14
+
+function %band_i64_mem(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = load.i64 v1
+  v3 = band.i64 v0, v2
+  return v3
+}
+
+; check:  ng %r2, 0(%r3)
+; nextln: br %r14
+
+function %band_i32(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = band.i32 v0, v1
+  return v2
+}
+
+; check:  nr %r2, %r3
+; nextln: br %r14
+
+function %band_i32_mem(i32, i64) -> i32 {
+block0(v0: i32, v1: i64):
+  v2 = load.i32 v1
+  v3 = band.i32 v0, v2
+  return v3
+}
+
+; check:  n %r2, 0(%r3)
+; nextln: br %r14
+
+function %band_i32_memoff(i32, i64) -> i32 {
+block0(v0: i32, v1: i64):
+  v2 = load.i32 v1+4096
+  v3 = band.i32 v0, v2
+  return v3
+}
+
+; check:  ny %r2, 4096(%r3)
+; nextln: br %r14
+
+function %band_i16(i16, i16) -> i16 {
+block0(v0: i16, v1: i16):
+  v2 = band.i16 v0, v1
+  return v2
+}
+
+; check:  nr %r2, %r3
+; nextln: br %r14
+
+function %band_i16_mem(i16, i64) -> i16 {
+block0(v0: i16, v1: i64):
+  v2 = load.i16 v1
+  v3 = band.i16 v0, v2
+  return v3
+}
+
+; check:  llh %r3, 0(%r3)
+; nextln: nr %r2, %r3
+; nextln: br %r14
+
+function %band_i8(i8, i8) -> i8 {
+block0(v0: i8, v1: i8):
+  v2 = band.i8 v0, v1
+  return v2
+}
+
+; check:  nr %r2, %r3
+; nextln: br %r14
+
+function %band_i8_mem(i8, i64) -> i8 {
+block0(v0: i8, v1: i64):
+  v2 = load.i8 v1
+  v3 = band.i8 v0, v2
+  return v3
+}
+
+; check:  llc %r3, 0(%r3)
+; nextln: nr %r2, %r3
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BOR
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %bor_i64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = bor.i64 v0, v1
+  return v2
+}
+
+; check:  ogr %r2, %r3
+; nextln: br %r14
+
+function %bor_i64_mem(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = load.i64 v1
+  v3 = bor.i64 v0, v2
+  return v3
+}
+
+; check:  og %r2, 0(%r3)
+; nextln: br %r14
+
+function %bor_i32(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = bor.i32 v0, v1
+  return v2
+}
+
+; check:  or %r2, %r3
+; nextln: br %r14
+
+function %bor_i32_mem(i32, i64) -> i32 {
+block0(v0: i32, v1: i64):
+  v2 = load.i32 v1
+  v3 = bor.i32 v0, v2
+  return v3
+}
+
+; check:  o %r2, 0(%r3)
+; nextln: br %r14
+
+function %bor_i32_memoff(i32, i64) -> i32 {
+block0(v0: i32, v1: i64):
+  v2 = load.i32 v1+4096
+  v3 = bor.i32 v0, v2
+  return v3
+}
+
+; check:  oy %r2, 4096(%r3)
+; nextln: br %r14
+
+function %bor_i16(i16, i16) -> i16 {
+block0(v0: i16, v1: i16):
+  v2 = bor.i16 v0, v1
+  return v2
+}
+
+; check:  or %r2, %r3
+; nextln: br %r14
+
+function %bor_i16_mem(i16, i64) -> i16 {
+block0(v0: i16, v1: i64):
+  v2 = load.i16 v1
+  v3 = bor.i16 v0, v2
+  return v3
+}
+
+; check:  llh %r3, 0(%r3)
+; nextln: or %r2, %r3
+; nextln: br %r14
+
+function %bor_i8(i8, i8) -> i8 {
+block0(v0: i8, v1: i8):
+  v2 = bor.i8 v0, v1
+  return v2
+}
+
+; check:  or %r2, %r3
+; nextln: br %r14
+
+function %bor_i8_mem(i8, i64) -> i8 {
+block0(v0: i8, v1: i64):
+  v2 = load.i8 v1
+  v3 = bor.i8 v0, v2
+  return v3
+}
+
+; check:  llc %r3, 0(%r3)
+; nextln: or %r2, %r3
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BXOR
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %bxor_i64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = bxor.i64 v0, v1
+  return v2
+}
+
+; check:  xgr %r2, %r3
+; nextln: br %r14
+
+function %bxor_i64_mem(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = load.i64 v1
+  v3 = bxor.i64 v0, v2
+  return v3
+}
+
+; check:  xg %r2, 0(%r3)
+; nextln: br %r14
+
+function %bxor_i32(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = bxor.i32 v0, v1
+  return v2
+}
+
+; check:  xr %r2, %r3
+; nextln: br %r14
+
+function %bxor_i32_mem(i32, i64) -> i32 {
+block0(v0: i32, v1: i64):
+  v2 = load.i32 v1
+  v3 = bxor.i32 v0, v2
+  return v3
+}
+
+; check:  x %r2, 0(%r3)
+; nextln: br %r14
+
+function %bxor_i32_memoff(i32, i64) -> i32 {
+block0(v0: i32, v1: i64):
+  v2 = load.i32 v1+4096
+  v3 = bxor.i32 v0, v2
+  return v3
+}
+
+; check:  xy %r2, 4096(%r3)
+; nextln: br %r14
+
+function %bxor_i16(i16, i16) -> i16 {
+block0(v0: i16, v1: i16):
+  v2 = bxor.i16 v0, v1
+  return v2
+}
+
+; check:  xr %r2, %r3
+; nextln: br %r14
+
+function %bxor_i16_mem(i16, i64) -> i16 {
+block0(v0: i16, v1: i64):
+  v2 = load.i16 v1
+  v3 = bxor.i16 v0, v2
+  return v3
+}
+
+; check:  llh %r3, 0(%r3)
+; nextln: xr %r2, %r3
+; nextln: br %r14
+
+function %bxor_i8(i8, i8) -> i8 {
+block0(v0: i8, v1: i8):
+  v2 = bxor.i8 v0, v1
+  return v2
+}
+
+; check:  xr %r2, %r3
+; nextln: br %r14
+
+function %bxor_i8_mem(i8, i64) -> i8 {
+block0(v0: i8, v1: i64):
+  v2 = load.i8 v1
+  v3 = bxor.i8 v0, v2
+  return v3
+}
+
+; check:  llc %r3, 0(%r3)
+; nextln: xr %r2, %r3
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BAND_NOT
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %band_not_i64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = band_not.i64 v0, v1
+  return v2
+}
+
+; check:  nngrk %r2, %r2, %r3
+; nextln: br %r14
+
+function %band_not_i32(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = band_not.i32 v0, v1
+  return v2
+}
+
+; check:  nnrk %r2, %r2, %r3
+; nextln: br %r14
+
+function %band_not_i16(i16, i16) -> i16 {
+block0(v0: i16, v1: i16):
+  v2 = band_not.i16 v0, v1
+  return v2
+}
+
+; check:  nnrk %r2, %r2, %r3
+; nextln: br %r14
+
+function %band_not_i8(i8, i8) -> i8 {
+block0(v0: i8, v1: i8):
+  v2 = band_not.i8 v0, v1
+  return v2
+}
+
+; check:  nnrk %r2, %r2, %r3
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BOR_NOT
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %bor_not_i64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = bor_not.i64 v0, v1
+  return v2
+}
+
+; check:  nogrk %r2, %r2, %r3
+; nextln: br %r14
+
+function %bor_not_i32(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = bor_not.i32 v0, v1
+  return v2
+}
+
+; check:  nork %r2, %r2, %r3
+; nextln: br %r14
+
+function %bor_not_i16(i16, i16) -> i16 {
+block0(v0: i16, v1: i16):
+  v2 = bor_not.i16 v0, v1
+  return v2
+}
+
+; check:  nork %r2, %r2, %r3
+; nextln: br %r14
+
+function %bor_not_i8(i8, i8) -> i8 {
+block0(v0: i8, v1: i8):
+  v2 = bor_not.i8 v0, v1
+  return v2
+}
+
+; check:  nork %r2, %r2, %r3
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BXOR_NOT
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %bxor_not_i64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = bxor_not.i64 v0, v1
+  return v2
+}
+
+; check:  nxgrk %r2, %r2, %r3
+; nextln: br %r14
+
+function %bxor_not_i32(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = bxor_not.i32 v0, v1
+  return v2
+}
+
+; check:  nxrk %r2, %r2, %r3
+; nextln: br %r14
+
+function %bxor_not_i16(i16, i16) -> i16 {
+block0(v0: i16, v1: i16):
+  v2 = bxor_not.i16 v0, v1
+  return v2
+}
+
+; check:  nxrk %r2, %r2, %r3
+; nextln: br %r14
+
+function %bxor_not_i8(i8, i8) -> i8 {
+block0(v0: i8, v1: i8):
+  v2 = bxor_not.i8 v0, v1
+  return v2
+}
+
+; check:  nxrk %r2, %r2, %r3
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BNOT
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %bnot_i64(i64) -> i64 {
+block0(v0: i64):
+  v1 = bnot.i64 v0
+  return v1
+}
+
+; check:  nogrk %r2, %r2, %r2
+; nextln: br %r14
+
+function %bnot_i32(i32) -> i32 {
+block0(v0: i32):
+  v1 = bnot.i32 v0
+  return v1
+}
+
+; check:  nork %r2, %r2, %r2
+; nextln: br %r14
+
+function %bnot_i16(i16) -> i16 {
+block0(v0: i16):
+  v1 = bnot.i16 v0
+  return v1
+}
+
+; check:  nork %r2, %r2, %r2
+; nextln: br %r14
+
+function %bnot_i8(i8) -> i8 {
+block0(v0: i8):
+  v1 = bnot.i8 v0
+  return v1
+}
+
+; check:  nork %r2, %r2, %r2
+; nextln: br %r14
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; BITSELECT
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %bitselect_i64(i64, i64, i64) -> i64 {
+block0(v0: i64, v1: i64, v2: i64):
+  v3 = bitselect.i64 v0, v1, v2
+  return v3
+}
+
+; check:  ngr %r3, %r2
+; nextln: nngrk %r2, %r4, %r2
+; nextln: ogr %r2, %r3
+; nextln: br %r14
+
+function %bitselect_i32(i32, i32, i32) -> i32 {
+block0(v0: i32, v1: i32, v2: i32):
+  v3 = bitselect.i32 v0, v1, v2
+  return v3
+}
+
+; check:  nr %r3, %r2
+; nextln: nnrk %r2, %r4, %r2
+; nextln: or %r2, %r3
+; nextln: br %r14
+
+function %bitselect_i16(i16, i16, i16) -> i16 {
+block0(v0: i16, v1: i16, v2: i16):
+  v3 = bitselect.i16 v0, v1, v2
+  return v3
+}
+
+; check:  nr %r3, %r2
+; nextln: nnrk %r2, %r4, %r2
+; nextln: or %r2, %r3
+; nextln: br %r14
+
+function %bitselect_i8(i8, i8, i8) -> i8 {
+block0(v0: i8, v1: i8, v2: i8):
+  v3 = bitselect.i8 v0, v1, v2
+  return v3
+}
+
+; check:  nr %r3, %r2
+; nextln: nnrk %r2, %r4, %r2
+; nextln: or %r2, %r3
+; nextln: br %r14
+
--- a/cranelift/filetests/filetests/isa/s390x/call.clif
+++ b/cranelift/filetests/filetests/isa/s390x/call.clif
@@ -0,0 +1,113 @@
+test compile
+target s390x
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; CALL
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %call(i64) -> i64 {
+    fn0 = %g(i64) -> i64
+
+block0(v0: i64):
+    v1 = call fn0(v0)
+    return v1
+}
+
+; check:  stmg %r14, %r15, 112(%r15)
+; nextln: aghi %r15, -160
+; nextln: virtual_sp_offset_adjust 160
+; nextln: bras %r1, 12 ; data %g + 0 ; lg %r3, 0(%r1)
+; nextln: basr %r14, %r3
+; nextln: lmg %r14, %r15, 272(%r15)
+; nextln: br %r14
+
+function %call_uext(i32) -> i64 {
+    fn0 = %g(i32 uext) -> i64
+
+block0(v0: i32):
+    v1 = call fn0(v0)
+    return v1
+}
+
+; check:  stmg %r14, %r15, 112(%r15)
+; nextln: aghi %r15, -160
+; nextln: virtual_sp_offset_adjust 160
+; nextln: llgfr %r2, %r2
+; nextln: bras %r1, 12 ; data %g + 0 ; lg %r3, 0(%r1)
+; nextln: basr %r14, %r3
+; nextln: lmg %r14, %r15, 272(%r15)
+; nextln: br %r14
+
+function %ret_uext(i32) -> i32 uext {
+block0(v0: i32):
+    return v0
+}
+
+; check:  llgfr %r2, %r2
+; nextln: br %r14
+
+function %call_sext(i32) -> i64 {
+    fn0 = %g(i32 sext) -> i64
+
+block0(v0: i32):
+    v1 = call fn0(v0)
+    return v1
+}
+
+; check:  stmg %r14, %r15, 112(%r15)
+; nextln: aghi %r15, -160
+; nextln: virtual_sp_offset_adjust 160
+; nextln: lgfr %r2, %r2
+; nextln: bras %r1, 12 ; data %g + 0 ; lg %r3, 0(%r1)
+; nextln: basr %r14, %r3
+; nextln: lmg %r14, %r15, 272(%r15)
+; nextln: br %r14
+
+function %ret_sext(i32) -> i32 sext {
+block0(v0: i32):
+    return v0
+}
+
+; check:  lgfr %r2, %r2
+; nextln: br %r14
+
+function %call_colocated(i64) -> i64 {
+    fn0 = colocated %g(i64) -> i64
+
+block0(v0: i64):
+    v1 = call fn0(v0)
+    return v1
+}
+
+; check:  stmg %r14, %r15, 112(%r15)
+; nextln: aghi %r15, -160
+; nextln: virtual_sp_offset_adjust 160
+; nextln: brasl %r14, %g
+; nextln: lmg %r14, %r15, 272(%r15)
+; nextln: br %r14
+
+function %f2(i32) -> i64 {
+    fn0 = %g(i32 uext) -> i64
+
+block0(v0: i32):
+    v1 = call fn0(v0)
+    return v1
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; CALL_INDIRECT
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %call_indirect(i64, i64) -> i64 {
+    sig0 = (i64) -> i64
+block0(v0: i64, v1: i64):
+    v2 = call_indirect.i64 sig0, v1(v0)
+    return v2
+}
+
+; check:  stmg %r14, %r15, 112(%r15)
+; nextln: aghi %r15, -160
+; nextln: virtual_sp_offset_adjust 160
+; nextln: basr %r14, %r3
+; nextln: lmg %r14, %r15, 272(%r15)
+; nextln: br %r14
--- a/cranelift/filetests/filetests/isa/s390x/condbr.clif
+++ b/cranelift/filetests/filetests/isa/s390x/condbr.clif
@@ -0,0 +1,62 @@
+test compile
+target s390x
+
+function %f(i64, i64) -> b1 {
+block0(v0: i64, v1: i64):
+  v2 = icmp eq v0, v1
+  return v2
+}
+
+; check:  clgr %r2, %r3
+; nextln: lhi %r2, 0
+; nextln: lochie %r2, 1
+; nextln: br %r14
+
+function %f(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = icmp eq v0, v1
+  brnz v2, block1
+  jump block2
+
+block1:
+  v4 = iconst.i64 1
+  return v4
+
+block2:
+  v5 = iconst.i64 2
+  return v5
+}
+
+; check: Block 0:
+; check:  clgr %r2, %r3
+; nextln: jge label1 ; jg label2
+; check: Block 1:
+; check:  lghi %r2, 1
+; nextln: br %r14
+; check: Block 2:
+; check:  lghi %r2, 2
+; nextln: br %r14
+
+function %f(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+  v2 = icmp eq v0, v1
+  brnz v2, block1
+  jump block1
+
+block1:
+  v4 = iconst.i64 1
+  return v4
+}
+
+; FIXME: Should optimize away branches
+
+; check: Block 0:
+; check:  clgr %r2, %r3
+; nextln: jge label1 ; jg label2
+; check: Block 1:
+; check:  jg label3
+; check: Block 2:
+; check:  jg label3
+; check: Block 3:
+; check:  lghi %r2, 1
+; nextln: br %r14
--- a/cranelift/filetests/filetests/isa/s390x/condops.clif
+++ b/cranelift/filetests/filetests/isa/s390x/condops.clif
@@ -0,0 +1,43 @@
+test compile
+target s390x
+
+function %f(i8, i64, i64) -> i64 {
+block0(v0: i8, v1: i64, v2: i64):
+  v3 = iconst.i8 42
+  v4 = icmp eq v0, v3
+  v5 = select.i64 v4, v1, v2
+  return v5
+}
+
+; check:  llcr %r2, %r2
+; nextln: clfi %r2, 42
+; nextln: locgre %r4, %r3
+; nextln: lgr %r2, %r4
+; nextln: br %r14
+
+function %g(b1, i8, i8) -> i8 {
+block0(v0: b1, v1: i8, v2: i8):
+  v3 = select.i8 v0, v1, v2
+  return v3
+}
+
+; FIXME: optimize i8/i16 compares
+
+; check:  llcr %r2, %r2
+; nextln: chi %r2, 0
+; nextln: locrlh %r4, %r3
+; nextln: lr %r2, %r4
+; nextln: br %r14
+
+function %i(i32, i8, i8) -> i8 {
+block0(v0: i32, v1: i8, v2: i8):
+  v3 = iconst.i32 42
+  v4 = icmp.i32 eq v0, v3
+  v5 = select.i8 v4, v1, v2
+  return v5
+}
+
+; check:  clfi %r2, 42
+; nextln: locre %r4, %r3
+; nextln: lr %r2, %r4
+; nextln: br %r14
--- a/Show More
+++ b/Show More