From 95b0b05af283ded8a6130856c4d385ea49833ef1 Mon Sep 17 00:00:00 2001
From: Anton Kirilov <anton.kirilov@arm.com>
Date: Fri, 19 Jun 2020 01:00:47 +0100
Subject: [PATCH 01/11] AArch64: Introduce an enum to specify vector
 instruction operand sizes

Copyright (c) 2020, Arm Limited.
---
 .../codegen/src/isa/aarch64/inst/args.rs      |  53 ++++
 .../codegen/src/isa/aarch64/inst/emit.rs      | 133 ++++------
 .../src/isa/aarch64/inst/emit_tests.rs        | 250 +++++++++---------
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 188 ++++++-------
 .../codegen/src/isa/aarch64/inst/regs.rs      |  50 ++--
 cranelift/codegen/src/isa/aarch64/lower.rs    |  17 +-
 .../codegen/src/isa/aarch64/lower_inst.rs     |  71 +++--
 7 files changed, 374 insertions(+), 388 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 6bbd618685..43e8471ac7 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -3,6 +3,7 @@
 // Some variants are never constructed, but we still want them as options in the future.
 #![allow(dead_code)]
 
+use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8};
 use crate::ir::Type;
 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::lower::ty_bits;
@@ -587,3 +588,55 @@ impl ScalarSize {
         }
     }
 }
+
+/// Type used to communicate the size of a vector operand; see `lane_size` for the lane widths.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum VectorSize {
+    Size8x8, // 8-bit lanes; `from_ty` result for `I8X8`.
+    Size8x16, // 8-bit lanes; `from_ty` result for `I8X16`.
+    Size16x4, // 16-bit lanes; `from_ty` result for `I16X4`.
+    Size16x8, // 16-bit lanes; `from_ty` result for `I16X8`.
+    Size32x2, // 32-bit lanes; `from_ty` result for `I32X2` and `F32X2`.
+    Size32x4, // 32-bit lanes; `from_ty` result for `I32X4` and `F32X4`.
+    Size64x2, // 64-bit lanes; `from_ty` result for `I64X2` and `F64X2`.
+}
+
+impl VectorSize {
+    /// Convert from a type into a vector operand size; panics (`unimplemented!`) on unlisted types.
+    pub fn from_ty(ty: Type) -> VectorSize {
+        match ty {
+            F32X2 => VectorSize::Size32x2, // Float vectors share the variant of `I32X2` below.
+            F32X4 => VectorSize::Size32x4, // Same variant as `I32X4` below.
+            F64X2 => VectorSize::Size64x2, // Same variant as `I64X2` below.
+            I8X8 => VectorSize::Size8x8,
+            I8X16 => VectorSize::Size8x16,
+            I16X4 => VectorSize::Size16x4,
+            I16X8 => VectorSize::Size16x8,
+            I32X2 => VectorSize::Size32x2,
+            I32X4 => VectorSize::Size32x4,
+            I64X2 => VectorSize::Size64x2,
+            _ => unimplemented!(), // No mapping is defined for any other type.
+        }
+    }
+
+    /// Get the integer operand size for a lane: `Size64` for `Size64x2`, `Size32` for all others.
+    pub fn operand_size(&self) -> OperandSize {
+        match self {
+            VectorSize::Size64x2 => OperandSize::Size64,
+            _ => OperandSize::Size32, // Sizes with 8-, 16- and 32-bit lanes all land here.
+        }
+    }
+
+    /// Get the scalar size of a single lane; the match is exhaustive, so no variant panics.
+    pub fn lane_size(&self) -> ScalarSize {
+        match self {
+            VectorSize::Size8x8 => ScalarSize::Size8,
+            VectorSize::Size8x16 => ScalarSize::Size8,
+            VectorSize::Size16x4 => ScalarSize::Size16,
+            VectorSize::Size16x8 => ScalarSize::Size16,
+            VectorSize::Size32x2 => ScalarSize::Size32,
+            VectorSize::Size32x4 => ScalarSize::Size32,
+            VectorSize::Size64x2 => ScalarSize::Size64,
+        }
+    }
+}
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 9fc952f644..f12205dbd4 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1007,7 +1007,7 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
             }
             &Inst::FpuMoveFromVec { rd, rn, idx, size } => {
-                let (imm5, shift, mask) = match size {
+                let (imm5, shift, mask) = match size.lane_size() {
                     ScalarSize::Size32 => (0b00100, 3, 0b011),
                     ScalarSize::Size64 => (0b01000, 4, 0b001),
                     _ => unimplemented!(),
@@ -1048,6 +1048,10 @@ impl MachInstEmit for Inst {
                     FPUOp2::Max64 => 0b000_11110_01_1_00000_010010,
                     FPUOp2::Min32 => 0b000_11110_00_1_00000_010110,
                     FPUOp2::Min64 => 0b000_11110_01_1_00000_010110,
+                    FPUOp2::Sqadd64 => 0b010_11110_11_1_00000_000011,
+                    FPUOp2::Uqadd64 => 0b011_11110_11_1_00000_000011,
+                    FPUOp2::Sqsub64 => 0b010_11110_11_1_00000_001011,
+                    FPUOp2::Uqsub64 => 0b011_11110_11_1_00000_001011,
                 };
                 sink.put4(enc_fpurrr(top22, rd, rn, rm));
             }
@@ -1102,31 +1106,25 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
-            &Inst::VecMisc { op, rd, rn, ty } => {
-                let enc_size = match ty {
-                    I8X16 => 0b00,
-                    I16X8 => 0b01,
-                    I32X4 => 0b10,
-                    I64X2 => 0b11,
-                    _ => 0,
+            &Inst::VecMisc { op, rd, rn, size } => {
+                let enc_size = match size {
+                    VectorSize::Size8x16 => 0b00,
+                    VectorSize::Size16x8 => 0b01,
+                    VectorSize::Size32x4 => 0b10,
+                    VectorSize::Size64x2 => 0b11,
+                    _ => unimplemented!(),
                 };
                 let (bits_12_16, size) = match op {
-                    VecMisc2::Not => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b00101, 0b00)
-                    }
-                    VecMisc2::Neg => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b01011, enc_size)
-                    }
+                    VecMisc2::Not => (0b00101, 0b00),
+                    VecMisc2::Neg => (0b01011, enc_size),
                 };
                 sink.put4(enc_vec_rr_misc(size, bits_12_16, rd, rn));
             }
-            &Inst::VecLanes { op, rd, rn, ty } => {
-                let (q, size) = match ty {
-                    I8X16 => (0b1, 0b00),
-                    I16X8 => (0b1, 0b01),
-                    I32X4 => (0b1, 0b10),
+            &Inst::VecLanes { op, rd, rn, size } => {
+                let (q, size) = match size {
+                    VectorSize::Size8x16 => (0b1, 0b00),
+                    VectorSize::Size16x8 => (0b1, 0b01),
+                    VectorSize::Size32x4 => (0b1, 0b10),
                     _ => unreachable!(),
                 };
                 let (u, opcode) = match op {
@@ -1250,12 +1248,12 @@ impl MachInstEmit for Inst {
                         | machreg_to_vec(rd.to_reg()),
                 );
             }
-            &Inst::MovFromVec { rd, rn, idx, ty } => {
-                let (q, imm5, shift, mask) = match ty {
-                    I8 => (0b0, 0b00001, 1, 0b1111),
-                    I16 => (0b0, 0b00010, 2, 0b0111),
-                    I32 => (0b0, 0b00100, 3, 0b0011),
-                    I64 => (0b1, 0b01000, 4, 0b0001),
+            &Inst::MovFromVec { rd, rn, idx, size } => {
+                let (q, imm5, shift, mask) = match size {
+                    VectorSize::Size8x16 => (0b0, 0b00001, 1, 0b1111),
+                    VectorSize::Size16x8 => (0b0, 0b00010, 2, 0b0111),
+                    VectorSize::Size32x4 => (0b0, 0b00100, 3, 0b0011),
+                    VectorSize::Size64x2 => (0b1, 0b01000, 4, 0b0001),
                     _ => unreachable!(),
                 };
                 debug_assert_eq!(idx & mask, idx);
@@ -1268,12 +1266,12 @@ impl MachInstEmit for Inst {
                         | machreg_to_gpr(rd.to_reg()),
                 );
             }
-            &Inst::VecDup { rd, rn, ty } => {
-                let imm5 = match ty {
-                    I8 => 0b00001,
-                    I16 => 0b00010,
-                    I32 => 0b00100,
-                    I64 => 0b01000,
+            &Inst::VecDup { rd, rn, size } => {
+                let imm5 = match size {
+                    VectorSize::Size8x16 => 0b00001,
+                    VectorSize::Size16x8 => 0b00010,
+                    VectorSize::Size32x4 => 0b00100,
+                    VectorSize::Size64x2 => 0b01000,
                     _ => unimplemented!(),
                 };
                 sink.put4(
@@ -1283,10 +1281,10 @@ impl MachInstEmit for Inst {
                         | machreg_to_vec(rd.to_reg()),
                 );
             }
-            &Inst::VecDupFromFpu { rd, rn, ty } => {
-                let imm5 = match ty {
-                    F32 => 0b00100,
-                    F64 => 0b01000,
+            &Inst::VecDupFromFpu { rd, rn, size } => {
+                let imm5 = match size {
+                    VectorSize::Size32x4 => 0b00100,
+                    VectorSize::Size64x2 => 0b01000,
                     _ => unimplemented!(),
                 };
                 sink.put4(
@@ -1318,41 +1316,25 @@ impl MachInstEmit for Inst {
                 rn,
                 rm,
                 alu_op,
-                ty,
+                size,
             } => {
-                let enc_size = match ty {
-                    I8X16 => 0b00,
-                    I16X8 => 0b01,
-                    I32X4 => 0b10,
-                    I64X2 => 0b11,
+                let enc_size = match size {
+                    VectorSize::Size8x16 => 0b00,
+                    VectorSize::Size16x8 => 0b01,
+                    VectorSize::Size32x4 => 0b10,
+                    VectorSize::Size64x2 => 0b11,
                     _ => 0,
                 };
-                let enc_size_for_fcmp = match ty {
-                    F32X4 => 0b0,
-                    F64X2 => 0b1,
+                let enc_size_for_fcmp = match size {
+                    VectorSize::Size32x4 => 0b0,
+                    VectorSize::Size64x2 => 0b1,
                     _ => 0,
                 };
 
                 let (top11, bit15_10) = match alu_op {
-                    VecALUOp::SQAddScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b010_11110_11_1, 0b000011)
-                    }
                     VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::SQSubScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b010_11110_11_1, 0b001011)
-                    }
                     VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::UQAddScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b011_11110_11_1, 0b000011)
-                    }
                     VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::UQSubScalar => {
-                        debug_assert_eq!(I64, ty);
-                        (0b011_11110_11_1, 0b001011)
-                    }
                     VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
                     VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
                     VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
@@ -1364,31 +1346,16 @@ impl MachInstEmit for Inst {
                     VecALUOp::Fcmge => (0b011_01110_00_1 | enc_size_for_fcmp << 1, 0b111001),
                     // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
-                    VecALUOp::And => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b010_01110_00_1, 0b000111)
-                    }
-                    VecALUOp::Bic => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b010_01110_01_1, 0b000111)
-                    }
-                    VecALUOp::Orr => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b010_01110_10_1, 0b000111)
-                    }
-                    VecALUOp::Eor => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b011_01110_00_1, 0b000111)
-                    }
-                    VecALUOp::Bsl => {
-                        debug_assert_eq!(128, ty_bits(ty));
-                        (0b011_01110_01_1, 0b000111)
-                    }
+                    VecALUOp::And => (0b010_01110_00_1, 0b000111),
+                    VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
+                    VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
+                    VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
+                    VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
                     VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
                     VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
                     VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
                     VecALUOp::Mul => {
-                        debug_assert_ne!(I64X2, ty);
+                        debug_assert_ne!(size, VectorSize::Size64x2);
                         (0b010_01110_00_1 | enc_size << 1, 0b100111)
                     }
                     VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index e8148dbe41..29e3036e16 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1841,7 +1841,7 @@ fn test_aarch64_binemit() {
             rd: writable_xreg(3),
             rn: vreg(27),
             idx: 14,
-            ty: I8,
+            size: VectorSize::Size8x16,
         },
         "633F1D0E",
         "umov w3, v27.b[14]",
@@ -1851,7 +1851,7 @@ fn test_aarch64_binemit() {
             rd: writable_xreg(24),
             rn: vreg(5),
             idx: 3,
-            ty: I16,
+            size: VectorSize::Size16x8,
         },
         "B83C0E0E",
         "umov w24, v5.h[3]",
@@ -1861,7 +1861,7 @@ fn test_aarch64_binemit() {
             rd: writable_xreg(12),
             rn: vreg(17),
             idx: 1,
-            ty: I32,
+            size: VectorSize::Size32x4,
         },
         "2C3E0C0E",
         "mov w12, v17.s[1]",
@@ -1871,7 +1871,7 @@ fn test_aarch64_binemit() {
             rd: writable_xreg(21),
             rn: vreg(20),
             idx: 0,
-            ty: I64,
+            size: VectorSize::Size64x2,
         },
         "953E084E",
         "mov x21, v20.d[0]",
@@ -1900,7 +1900,7 @@ fn test_aarch64_binemit() {
         Inst::VecDup {
             rd: writable_vreg(25),
             rn: xreg(7),
-            ty: I8,
+            size: VectorSize::Size8x16,
         },
         "F90C014E",
         "dup v25.16b, w7",
@@ -1909,7 +1909,7 @@ fn test_aarch64_binemit() {
         Inst::VecDup {
             rd: writable_vreg(2),
             rn: xreg(23),
-            ty: I16,
+            size: VectorSize::Size16x8,
         },
         "E20E024E",
         "dup v2.8h, w23",
@@ -1918,7 +1918,7 @@ fn test_aarch64_binemit() {
         Inst::VecDup {
             rd: writable_vreg(0),
             rn: xreg(28),
-            ty: I32,
+            size: VectorSize::Size32x4,
         },
         "800F044E",
         "dup v0.4s, w28",
@@ -1927,7 +1927,7 @@ fn test_aarch64_binemit() {
         Inst::VecDup {
             rd: writable_vreg(31),
             rn: xreg(5),
-            ty: I64,
+            size: VectorSize::Size64x2,
         },
         "BF0C084E",
         "dup v31.2d, x5",
@@ -1936,7 +1936,7 @@ fn test_aarch64_binemit() {
         Inst::VecDupFromFpu {
             rd: writable_vreg(14),
             rn: vreg(19),
-            ty: F32,
+            size: VectorSize::Size32x4,
         },
         "6E06044E",
         "dup v14.4s, v19.s[0]",
@@ -1945,7 +1945,7 @@ fn test_aarch64_binemit() {
         Inst::VecDupFromFpu {
             rd: writable_vreg(18),
             rn: vreg(10),
-            ty: F64,
+            size: VectorSize::Size64x2,
         },
         "5205084E",
         "dup v18.2d, v10.d[0]",
@@ -2004,50 +2004,6 @@ fn test_aarch64_binemit() {
         "5CA4202F",
         "uxtl v28.2d, v2.2s",
     ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::UQAddScalar,
-            ty: I64,
-        },
-        "D50EF77E",
-        "uqadd d21, d22, d23",
-    ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::SQAddScalar,
-            ty: I64,
-        },
-        "D50EF75E",
-        "sqadd d21, d22, d23",
-    ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::UQSubScalar,
-            ty: I64,
-        },
-        "D52EF77E",
-        "uqsub d21, d22, d23",
-    ));
-    insns.push((
-        Inst::VecRRR {
-            rd: writable_vreg(21),
-            rn: vreg(22),
-            rm: vreg(23),
-            alu_op: VecALUOp::SQSubScalar,
-            ty: I64,
-        },
-        "D52EF75E",
-        "sqsub d21, d22, d23",
-    ));
 
     insns.push((
         Inst::VecRRR {
@@ -2055,7 +2011,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(2),
             rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "410C284E",
         "sqadd v1.16b, v2.16b, v8.16b",
@@ -2067,7 +2023,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(12),
             rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "810D7C4E",
         "sqadd v1.8h, v12.8h, v28.8h",
@@ -2079,7 +2035,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(12),
             rn: vreg(2),
             rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "4C0CA64E",
         "sqadd v12.4s, v2.4s, v6.4s",
@@ -2091,7 +2047,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(20),
             rn: vreg(7),
             rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "F40CED4E",
         "sqadd v20.2d, v7.2d, v13.2d",
@@ -2103,7 +2059,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(2),
             rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "412C284E",
         "sqsub v1.16b, v2.16b, v8.16b",
@@ -2115,7 +2071,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(12),
             rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "812D7C4E",
         "sqsub v1.8h, v12.8h, v28.8h",
@@ -2127,7 +2083,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(12),
             rn: vreg(2),
             rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "4C2CA64E",
         "sqsub v12.4s, v2.4s, v6.4s",
@@ -2139,7 +2095,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(20),
             rn: vreg(7),
             rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "F42CED4E",
         "sqsub v20.2d, v7.2d, v13.2d",
@@ -2151,7 +2107,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(2),
             rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "410C286E",
         "uqadd v1.16b, v2.16b, v8.16b",
@@ -2163,7 +2119,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(12),
             rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "810D7C6E",
         "uqadd v1.8h, v12.8h, v28.8h",
@@ -2175,7 +2131,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(12),
             rn: vreg(2),
             rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "4C0CA66E",
         "uqadd v12.4s, v2.4s, v6.4s",
@@ -2187,7 +2143,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(20),
             rn: vreg(7),
             rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "F40CED6E",
         "uqadd v20.2d, v7.2d, v13.2d",
@@ -2199,7 +2155,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(2),
             rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "412C286E",
         "uqsub v1.16b, v2.16b, v8.16b",
@@ -2211,7 +2167,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(12),
             rm: vreg(28),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "812D7C6E",
         "uqsub v1.8h, v12.8h, v28.8h",
@@ -2223,7 +2179,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(12),
             rn: vreg(2),
             rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "4C2CA66E",
         "uqsub v12.4s, v2.4s, v6.4s",
@@ -2235,7 +2191,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(20),
             rn: vreg(7),
             rm: vreg(13),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "F42CED6E",
         "uqsub v20.2d, v7.2d, v13.2d",
@@ -2247,7 +2203,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(3),
             rn: vreg(23),
             rm: vreg(24),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "E38E386E",
         "cmeq v3.16b, v23.16b, v24.16b",
@@ -2259,7 +2215,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(3),
             rn: vreg(23),
             rm: vreg(24),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "E336384E",
         "cmgt v3.16b, v23.16b, v24.16b",
@@ -2271,7 +2227,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(23),
             rn: vreg(9),
             rm: vreg(12),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "373D2C4E",
         "cmge v23.16b, v9.16b, v12.16b",
@@ -2283,7 +2239,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(5),
             rn: vreg(1),
             rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "2534216E",
         "cmhi v5.16b, v1.16b, v1.16b",
@@ -2295,7 +2251,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(2),
             rm: vreg(15),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "483C2F6E",
         "cmhs v8.16b, v2.16b, v15.16b",
@@ -2307,7 +2263,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(3),
             rn: vreg(23),
             rm: vreg(24),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "E38E786E",
         "cmeq v3.8h, v23.8h, v24.8h",
@@ -2319,7 +2275,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(3),
             rn: vreg(23),
             rm: vreg(24),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "E336784E",
         "cmgt v3.8h, v23.8h, v24.8h",
@@ -2331,7 +2287,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(23),
             rn: vreg(9),
             rm: vreg(12),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "373D6C4E",
         "cmge v23.8h, v9.8h, v12.8h",
@@ -2343,7 +2299,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(5),
             rn: vreg(1),
             rm: vreg(1),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "2534616E",
         "cmhi v5.8h, v1.8h, v1.8h",
@@ -2355,7 +2311,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(2),
             rm: vreg(15),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "483C6F6E",
         "cmhs v8.8h, v2.8h, v15.8h",
@@ -2367,7 +2323,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(3),
             rn: vreg(23),
             rm: vreg(24),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "E38EB86E",
         "cmeq v3.4s, v23.4s, v24.4s",
@@ -2379,7 +2335,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(3),
             rn: vreg(23),
             rm: vreg(24),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "E336B84E",
         "cmgt v3.4s, v23.4s, v24.4s",
@@ -2391,7 +2347,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(23),
             rn: vreg(9),
             rm: vreg(12),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "373DAC4E",
         "cmge v23.4s, v9.4s, v12.4s",
@@ -2403,7 +2359,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(5),
             rn: vreg(1),
             rm: vreg(1),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "2534A16E",
         "cmhi v5.4s, v1.4s, v1.4s",
@@ -2415,7 +2371,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(2),
             rm: vreg(15),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "483CAF6E",
         "cmhs v8.4s, v2.4s, v15.4s",
@@ -2427,7 +2383,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(28),
             rn: vreg(12),
             rm: vreg(4),
-            ty: F32X4,
+            size: VectorSize::Size32x4,
         },
         "9CE5244E",
         "fcmeq v28.4s, v12.4s, v4.4s",
@@ -2439,7 +2395,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(3),
             rn: vreg(16),
             rm: vreg(31),
-            ty: F64X2,
+            size: VectorSize::Size64x2,
         },
         "03E6FF6E",
         "fcmgt v3.2d, v16.2d, v31.2d",
@@ -2451,7 +2407,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(23),
             rm: vreg(0),
-            ty: F64X2,
+            size: VectorSize::Size64x2,
         },
         "F2E6606E",
         "fcmge v18.2d, v23.2d, v0.2d",
@@ -2463,7 +2419,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(20),
             rn: vreg(19),
             rm: vreg(18),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "741E324E",
         "and v20.16b, v19.16b, v18.16b",
@@ -2475,7 +2431,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(11),
             rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "681D614E",
         "bic v8.16b, v11.16b, v1.16b",
@@ -2487,7 +2443,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(15),
             rn: vreg(2),
             rm: vreg(12),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "4F1CAC4E",
         "orr v15.16b, v2.16b, v12.16b",
@@ -2499,7 +2455,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(3),
             rm: vreg(22),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "721C366E",
         "eor v18.16b, v3.16b, v22.16b",
@@ -2511,7 +2467,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(9),
             rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "281D616E",
         "bsl v8.16b, v9.16b, v1.16b",
@@ -2523,7 +2479,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(12),
             rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "88A5216E",
         "umaxp v8.16b, v12.16b, v1.16b",
@@ -2535,7 +2491,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(6),
             rm: vreg(1),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "C1A4616E",
         "umaxp v1.8h, v6.8h, v1.8h",
@@ -2547,7 +2503,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(20),
             rm: vreg(16),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "81A6B06E",
         "umaxp v1.4s, v20.4s, v16.4s",
@@ -2559,7 +2515,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(5),
             rn: vreg(1),
             rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "2584214E",
         "add v5.16b, v1.16b, v1.16b",
@@ -2571,7 +2527,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(7),
             rn: vreg(13),
             rm: vreg(2),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "A785624E",
         "add v7.8h, v13.8h, v2.8h",
@@ -2583,7 +2539,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(9),
             rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "3285A64E",
         "add v18.4s, v9.4s, v6.4s",
@@ -2595,7 +2551,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(3),
             rm: vreg(2),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "6184E24E",
         "add v1.2d, v3.2d, v2.2d",
@@ -2607,7 +2563,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(5),
             rn: vreg(1),
             rm: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "2584216E",
         "sub v5.16b, v1.16b, v1.16b",
@@ -2619,7 +2575,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(7),
             rn: vreg(13),
             rm: vreg(2),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "A785626E",
         "sub v7.8h, v13.8h, v2.8h",
@@ -2631,7 +2587,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(9),
             rm: vreg(6),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "3285A66E",
         "sub v18.4s, v9.4s, v6.4s",
@@ -2643,7 +2599,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(0),
             rm: vreg(8),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "1284E86E",
         "sub v18.2d, v0.2d, v8.2d",
@@ -2655,7 +2611,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(25),
             rn: vreg(9),
             rm: vreg(8),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "399D284E",
         "mul v25.16b, v9.16b, v8.16b",
@@ -2667,7 +2623,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(30),
             rn: vreg(30),
             rm: vreg(12),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "DE9F6C4E",
         "mul v30.8h, v30.8h, v12.8h",
@@ -2679,7 +2635,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(18),
             rm: vreg(18),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "529EB24E",
         "mul v18.4s, v18.4s, v18.4s",
@@ -2691,7 +2647,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(18),
             rm: vreg(18),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "5246326E",
         "ushl v18.16b, v18.16b, v18.16b",
@@ -2703,7 +2659,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(18),
             rm: vreg(18),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "5246726E",
         "ushl v18.8h, v18.8h, v18.8h",
@@ -2715,7 +2671,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(1),
             rm: vreg(21),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "3244B56E",
         "ushl v18.4s, v1.4s, v21.4s",
@@ -2727,7 +2683,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(5),
             rn: vreg(7),
             rm: vreg(19),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "E544F36E",
         "ushl v5.2d, v7.2d, v19.2d",
@@ -2739,7 +2695,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(18),
             rn: vreg(18),
             rm: vreg(18),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "5246324E",
         "sshl v18.16b, v18.16b, v18.16b",
@@ -2751,7 +2707,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(30),
             rn: vreg(1),
             rm: vreg(29),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "3E447D4E",
         "sshl v30.8h, v1.8h, v29.8h",
@@ -2763,7 +2719,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(22),
             rm: vreg(21),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "C846B54E",
         "sshl v8.4s, v22.4s, v21.4s",
@@ -2775,7 +2731,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(8),
             rn: vreg(22),
             rm: vreg(2),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "C846E24E",
         "sshl v8.2d, v22.2d, v2.2d",
@@ -2786,7 +2742,7 @@ fn test_aarch64_binemit() {
             op: VecMisc2::Not,
             rd: writable_vreg(2),
             rn: vreg(1),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "2258206E",
         "mvn v2.16b, v1.16b",
@@ -2797,7 +2753,7 @@ fn test_aarch64_binemit() {
             op: VecMisc2::Neg,
             rd: writable_vreg(8),
             rn: vreg(12),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "88B9206E",
         "neg v8.16b, v12.16b",
@@ -2808,7 +2764,7 @@ fn test_aarch64_binemit() {
             op: VecMisc2::Neg,
             rd: writable_vreg(0),
             rn: vreg(31),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "E0BB606E",
         "neg v0.8h, v31.8h",
@@ -2819,7 +2775,7 @@ fn test_aarch64_binemit() {
             op: VecMisc2::Neg,
             rd: writable_vreg(2),
             rn: vreg(3),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "62B8A06E",
         "neg v2.4s, v3.4s",
@@ -2830,7 +2786,7 @@ fn test_aarch64_binemit() {
             op: VecMisc2::Neg,
             rd: writable_vreg(10),
             rn: vreg(8),
-            ty: I64X2,
+            size: VectorSize::Size64x2,
         },
         "0AB9E06E",
         "neg v10.2d, v8.2d",
@@ -2841,7 +2797,7 @@ fn test_aarch64_binemit() {
             op: VecLanesOp::Uminv,
             rd: writable_vreg(2),
             rn: vreg(1),
-            ty: I8X16,
+            size: VectorSize::Size8x16,
         },
         "22A8316E",
         "uminv b2, v1.16b",
@@ -2852,7 +2808,7 @@ fn test_aarch64_binemit() {
             op: VecLanesOp::Uminv,
             rd: writable_vreg(3),
             rn: vreg(11),
-            ty: I16X8,
+            size: VectorSize::Size16x8,
         },
         "63A9716E",
         "uminv h3, v11.8h",
@@ -2863,7 +2819,7 @@ fn test_aarch64_binemit() {
             op: VecLanesOp::Uminv,
             rd: writable_vreg(18),
             rn: vreg(4),
-            ty: I32X4,
+            size: VectorSize::Size32x4,
         },
         "92A8B16E",
         "uminv s18, v4.4s",
@@ -3214,7 +3170,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(1),
             rn: vreg(30),
             idx: 2,
-            size: ScalarSize::Size32,
+            size: VectorSize::Size32x4,
         },
         "C107145E",
         "mov s1, v30.s[2]",
@@ -3225,7 +3181,7 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(23),
             rn: vreg(11),
             idx: 0,
-            size: ScalarSize::Size64,
+            size: VectorSize::Size64x2,
         },
         "7705085E",
         "mov d23, v11.d[0]",
@@ -3443,6 +3399,50 @@ fn test_aarch64_binemit() {
         "fmin d15, d30, d31",
     ));
 
+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Uqadd64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D50EF77E",
+        "uqadd d21, d22, d23",
+    ));
+
+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Sqadd64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D50EF75E",
+        "sqadd d21, d22, d23",
+    ));
+
+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Uqsub64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D52EF77E",
+        "uqsub d21, d22, d23",
+    ));
+
+    insns.push((
+        Inst::FpuRRR {
+            fpu_op: FPUOp2::Sqsub64,
+            rd: writable_vreg(21),
+            rn: vreg(22),
+            rm: vreg(23),
+        },
+        "D52EF75E",
+        "sqsub d21, d22, d23",
+    ));
+
     insns.push((
         Inst::FpuRRRR {
             fpu_op: FPUOp3::MAdd32,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 79a72c245c..1c5c6f9a1c 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -5,8 +5,8 @@
 
 use crate::binemit::CodeOffset;
 use crate::ir::types::{
-    B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X2, F32X4, F64, F64X2, FFLAGS, I16,
-    I16X4, I16X8, I32, I32X2, I32X4, I64, I64X2, I8, I8X16, I8X8, IFLAGS, R32, R64,
+    B1, B16, B16X8, B32, B32X4, B64, B64X2, B8, B8X16, F32, F32X4, F64, F64X2, FFLAGS, I16, I16X8,
+    I32, I32X4, I64, I64X2, I8, I8X16, IFLAGS, R32, R64,
 };
 use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type};
 use crate::machinst::*;
@@ -125,6 +125,14 @@ pub enum FPUOp2 {
     Max64,
     Min32,
     Min64,
+    /// Signed saturating add (integer `sqadd` on 64-bit scalar operands)
+    Sqadd64,
+    /// Unsigned saturating add (integer `uqadd` on 64-bit scalar operands)
+    Uqadd64,
+    /// Signed saturating subtract (integer `sqsub` on 64-bit scalar operands)
+    Sqsub64,
+    /// Unsigned saturating subtract (integer `uqsub` on 64-bit scalar operands)
+    Uqsub64,
 }
 
 /// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -208,16 +216,12 @@ pub enum VecExtendOp {
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecALUOp {
     /// Signed saturating add
-    SQAddScalar,
     Sqadd,
     /// Unsigned saturating add
-    UQAddScalar,
     Uqadd,
     /// Signed saturating subtract
-    SQSubScalar,
     Sqsub,
     /// Unsigned saturating subtract
-    UQSubScalar,
     Uqsub,
     /// Compare bitwise equal
     Cmeq,
@@ -590,7 +594,7 @@ pub enum Inst {
         rd: Writable<Reg>,
         rn: Reg,
         idx: u8,
-        size: ScalarSize,
+        size: VectorSize,
     },
 
     /// 1-op FPU instruction.
@@ -734,21 +738,21 @@ pub enum Inst {
         rd: Writable<Reg>,
         rn: Reg,
         idx: u8,
-        ty: Type,
+        size: VectorSize,
     },
 
     /// Duplicate general-purpose register to vector.
     VecDup {
         rd: Writable<Reg>,
         rn: Reg,
-        ty: Type,
+        size: VectorSize,
     },
 
     /// Duplicate scalar to vector.
     VecDupFromFpu {
         rd: Writable<Reg>,
         rn: Reg,
-        ty: Type,
+        size: VectorSize,
     },
 
     /// Vector extend.
@@ -764,7 +768,7 @@ pub enum Inst {
         rd: Writable<Reg>,
         rn: Reg,
         rm: Reg,
-        ty: Type,
+        size: VectorSize,
     },
 
     /// Vector two register miscellaneous instruction.
@@ -772,7 +776,7 @@ pub enum Inst {
         op: VecMisc2,
         rd: Writable<Reg>,
         rn: Reg,
-        ty: Type,
+        size: VectorSize,
     },
 
     /// Vector instruction across lanes.
@@ -780,7 +784,7 @@ pub enum Inst {
         op: VecLanesOp,
         rd: Writable<Reg>,
         rn: Reg,
-        ty: Type,
+        size: VectorSize,
     },
 
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
@@ -2504,13 +2508,8 @@ impl Inst {
                 format!("mov {}.16b, {}.16b", rd, rn)
             }
             &Inst::FpuMoveFromVec { rd, rn, idx, size } => {
-                let vector_type = match size {
-                    ScalarSize::Size32 => F32,
-                    ScalarSize::Size64 => F64,
-                    _ => unimplemented!(),
-                };
-                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
-                let rn = show_vreg_element(rn, mb_rru, idx, vector_type);
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size());
+                let rn = show_vreg_element(rn, mb_rru, idx, size);
                 format!("mov {}, {}", rd, rn)
             }
             &Inst::FpuRR { fpu_op, rd, rn } => {
@@ -2542,6 +2541,10 @@ impl Inst {
                     FPUOp2::Max64 => ("fmax", ScalarSize::Size64),
                     FPUOp2::Min32 => ("fmin", ScalarSize::Size32),
                     FPUOp2::Min64 => ("fmin", ScalarSize::Size64),
+                    FPUOp2::Sqadd64 => ("sqadd", ScalarSize::Size64),
+                    FPUOp2::Uqadd64 => ("uqadd", ScalarSize::Size64),
+                    FPUOp2::Sqsub64 => ("sqsub", ScalarSize::Size64),
+                    FPUOp2::Uqsub64 => ("uqsub", ScalarSize::Size64),
                 };
                 let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
                 let rn = show_vreg_scalar(rn, mb_rru, size);
@@ -2557,7 +2560,7 @@ impl Inst {
                 };
 
                 let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>) -> String = if vector {
-                    |reg, mb_rru| show_vreg_vector(reg, mb_rru, F32X2)
+                    |reg, mb_rru| show_vreg_vector(reg, mb_rru, VectorSize::Size32x2)
                 } else {
                     |reg, mb_rru| show_vreg_scalar(reg, mb_rru, ScalarSize::Size64)
                 };
@@ -2706,45 +2709,36 @@ impl Inst {
                 let rn = rn.show_rru(mb_rru);
                 format!("mov {}.d[0], {}", rd, rn)
             }
-            &Inst::MovFromVec { rd, rn, idx, ty } => {
-                let op = match ty {
-                    I32 | I64 => "mov",
-                    _ => "umov",
+            &Inst::MovFromVec { rd, rn, idx, size } => {
+                let op = match size {
+                    VectorSize::Size8x16 => "umov",
+                    VectorSize::Size16x8 => "umov",
+                    VectorSize::Size32x4 => "mov",
+                    VectorSize::Size64x2 => "mov",
+                    _ => unimplemented!(),
                 };
-                let rd = show_ireg_sized(rd.to_reg(), mb_rru, OperandSize::from_ty(ty));
-                let rn = show_vreg_element(rn, mb_rru, idx, ty);
+                let rd = show_ireg_sized(rd.to_reg(), mb_rru, size.operand_size());
+                let rn = show_vreg_element(rn, mb_rru, idx, size);
                 format!("{} {}, {}", op, rd, rn)
             }
-            &Inst::VecDup { rd, rn, ty } => {
-                let vector_type = match ty {
-                    I8 => I8X16,
-                    I16 => I16X8,
-                    I32 => I32X4,
-                    I64 => I64X2,
-                    _ => unimplemented!(),
-                };
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, vector_type);
-                let rn = show_ireg_sized(rn, mb_rru, OperandSize::from_ty(ty));
+            &Inst::VecDup { rd, rn, size } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_ireg_sized(rn, mb_rru, size.operand_size());
                 format!("dup {}, {}", rd, rn)
             }
-            &Inst::VecDupFromFpu { rd, rn, ty } => {
-                let vector_type = match ty {
-                    F32 => F32X4,
-                    F64 => F64X2,
-                    _ => unimplemented!(),
-                };
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, vector_type);
-                let rn = show_vreg_element(rn, mb_rru, 0, ty);
+            &Inst::VecDupFromFpu { rd, rn, size } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_element(rn, mb_rru, 0, size);
                 format!("dup {}, {}", rd, rn)
             }
             &Inst::VecExtend { t, rd, rn } => {
                 let (op, dest, src) = match t {
-                    VecExtendOp::Sxtl8 => ("sxtl", I16X8, I8X8),
-                    VecExtendOp::Sxtl16 => ("sxtl", I32X4, I16X4),
-                    VecExtendOp::Sxtl32 => ("sxtl", I64X2, I32X2),
-                    VecExtendOp::Uxtl8 => ("uxtl", I16X8, I8X8),
-                    VecExtendOp::Uxtl16 => ("uxtl", I32X4, I16X4),
-                    VecExtendOp::Uxtl32 => ("uxtl", I64X2, I32X2),
+                    VecExtendOp::Sxtl8 => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8),
+                    VecExtendOp::Sxtl16 => ("sxtl", VectorSize::Size32x4, VectorSize::Size16x4),
+                    VecExtendOp::Sxtl32 => ("sxtl", VectorSize::Size64x2, VectorSize::Size32x2),
+                    VecExtendOp::Uxtl8 => ("uxtl", VectorSize::Size16x8, VectorSize::Size8x8),
+                    VecExtendOp::Uxtl16 => ("uxtl", VectorSize::Size32x4, VectorSize::Size16x4),
+                    VecExtendOp::Uxtl32 => ("uxtl", VectorSize::Size64x2, VectorSize::Size32x2),
                 };
                 let rd = show_vreg_vector(rd.to_reg(), mb_rru, dest);
                 let rn = show_vreg_vector(rn, mb_rru, src);
@@ -2755,72 +2749,54 @@ impl Inst {
                 rn,
                 rm,
                 alu_op,
-                ty,
+                size,
             } => {
-                let (op, vector, ty) = match alu_op {
-                    VecALUOp::SQAddScalar => ("sqadd", false, ty),
-                    VecALUOp::Sqadd => ("sqadd", true, ty),
-                    VecALUOp::UQAddScalar => ("uqadd", false, ty),
-                    VecALUOp::Uqadd => ("uqadd", true, ty),
-                    VecALUOp::SQSubScalar => ("sqsub", false, ty),
-                    VecALUOp::Sqsub => ("sqsub", true, ty),
-                    VecALUOp::UQSubScalar => ("uqsub", false, ty),
-                    VecALUOp::Uqsub => ("uqsub", true, ty),
-                    VecALUOp::Cmeq => ("cmeq", true, ty),
-                    VecALUOp::Cmge => ("cmge", true, ty),
-                    VecALUOp::Cmgt => ("cmgt", true, ty),
-                    VecALUOp::Cmhs => ("cmhs", true, ty),
-                    VecALUOp::Cmhi => ("cmhi", true, ty),
-                    VecALUOp::Fcmeq => ("fcmeq", true, ty),
-                    VecALUOp::Fcmgt => ("fcmgt", true, ty),
-                    VecALUOp::Fcmge => ("fcmge", true, ty),
-                    VecALUOp::And => ("and", true, I8X16),
-                    VecALUOp::Bic => ("bic", true, I8X16),
-                    VecALUOp::Orr => ("orr", true, I8X16),
-                    VecALUOp::Eor => ("eor", true, I8X16),
-                    VecALUOp::Bsl => ("bsl", true, I8X16),
-                    VecALUOp::Umaxp => ("umaxp", true, ty),
-                    VecALUOp::Add => ("add", true, ty),
-                    VecALUOp::Sub => ("sub", true, ty),
-                    VecALUOp::Mul => ("mul", true, ty),
-                    VecALUOp::Sshl => ("sshl", true, ty),
-                    VecALUOp::Ushl => ("ushl", true, ty),
+                let (op, size) = match alu_op {
+                    VecALUOp::Sqadd => ("sqadd", size),
+                    VecALUOp::Uqadd => ("uqadd", size),
+                    VecALUOp::Sqsub => ("sqsub", size),
+                    VecALUOp::Uqsub => ("uqsub", size),
+                    VecALUOp::Cmeq => ("cmeq", size),
+                    VecALUOp::Cmge => ("cmge", size),
+                    VecALUOp::Cmgt => ("cmgt", size),
+                    VecALUOp::Cmhs => ("cmhs", size),
+                    VecALUOp::Cmhi => ("cmhi", size),
+                    VecALUOp::Fcmeq => ("fcmeq", size),
+                    VecALUOp::Fcmgt => ("fcmgt", size),
+                    VecALUOp::Fcmge => ("fcmge", size),
+                    VecALUOp::And => ("and", VectorSize::Size8x16),
+                    VecALUOp::Bic => ("bic", VectorSize::Size8x16),
+                    VecALUOp::Orr => ("orr", VectorSize::Size8x16),
+                    VecALUOp::Eor => ("eor", VectorSize::Size8x16),
+                    VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
+                    VecALUOp::Umaxp => ("umaxp", size),
+                    VecALUOp::Add => ("add", size),
+                    VecALUOp::Sub => ("sub", size),
+                    VecALUOp::Mul => ("mul", size),
+                    VecALUOp::Sshl => ("sshl", size),
+                    VecALUOp::Ushl => ("ushl", size),
                 };
-
-                let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
-                    |reg, mb_rru, ty| show_vreg_vector(reg, mb_rru, ty)
-                } else {
-                    |reg, mb_rru, _ty| show_vreg_scalar(reg, mb_rru, ScalarSize::Size64)
-                };
-
-                let rd = show_vreg_fn(rd.to_reg(), mb_rru, ty);
-                let rn = show_vreg_fn(rn, mb_rru, ty);
-                let rm = show_vreg_fn(rm, mb_rru, ty);
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_vector(rn, mb_rru, size);
+                let rm = show_vreg_vector(rm, mb_rru, size);
                 format!("{} {}, {}, {}", op, rd, rn, rm)
             }
-            &Inst::VecMisc { op, rd, rn, ty } => {
-                let (op, ty) = match op {
-                    VecMisc2::Not => ("mvn", I8X16),
-                    VecMisc2::Neg => ("neg", ty),
+            &Inst::VecMisc { op, rd, rn, size } => {
+                let (op, size) = match op {
+                    VecMisc2::Not => ("mvn", VectorSize::Size8x16),
+                    VecMisc2::Neg => ("neg", size),
                 };
 
-                let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty);
-                let rn = show_vreg_vector(rn, mb_rru, ty);
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = show_vreg_vector(rn, mb_rru, size);
                 format!("{} {}, {}", op, rd, rn)
             }
-            &Inst::VecLanes { op, rd, rn, ty } => {
+            &Inst::VecLanes { op, rd, rn, size } => {
                 let op = match op {
                     VecLanesOp::Uminv => "uminv",
                 };
-                let size = match ty {
-                    I8X16 => ScalarSize::Size8,
-                    I16X8 => ScalarSize::Size16,
-                    I32X4 => ScalarSize::Size32,
-                    _ => unimplemented!(),
-                };
-
-                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
-                let rn = show_vreg_vector(rn, mb_rru, ty);
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size());
+                let rn = show_vreg_vector(rn, mb_rru, size);
                 format!("{} {}, {}", op, rd, rn)
             }
             &Inst::MovToNZCV { rn } => {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 88d67fb257..cbf1440927 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -1,8 +1,8 @@
 //! AArch64 ISA definitions: registers.
 
-use crate::ir::types::*;
 use crate::isa::aarch64::inst::OperandSize;
 use crate::isa::aarch64::inst::ScalarSize;
+use crate::isa::aarch64::inst::VectorSize;
 use crate::machinst::*;
 use crate::settings;
 
@@ -307,40 +307,42 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: Scalar
 }
 
 /// Show a vector register.
-pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) -> String {
+pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: VectorSize) -> String {
     assert_eq!(RegClass::V128, reg.get_class());
     let mut s = reg.show_rru(mb_rru);
 
-    match ty {
-        F32X2 => s.push_str(".2s"),
-        F32X4 => s.push_str(".4s"),
-        F64X2 => s.push_str(".2d"),
-        I8X8 => s.push_str(".8b"),
-        I8X16 => s.push_str(".16b"),
-        I16X4 => s.push_str(".4h"),
-        I16X8 => s.push_str(".8h"),
-        I32X2 => s.push_str(".2s"),
-        I32X4 => s.push_str(".4s"),
-        I64X2 => s.push_str(".2d"),
-        _ => unimplemented!(),
-    }
+    let suffix = match size {
+        VectorSize::Size8x8 => ".8b",
+        VectorSize::Size8x16 => ".16b",
+        VectorSize::Size16x4 => ".4h",
+        VectorSize::Size16x8 => ".8h",
+        VectorSize::Size32x2 => ".2s",
+        VectorSize::Size32x4 => ".4s",
+        VectorSize::Size64x2 => ".2d",
+    };
 
+    s.push_str(suffix);
     s
 }
 
 /// Show an indexed vector element.
-pub fn show_vreg_element(reg: Reg, mb_rru: Option<&RealRegUniverse>, idx: u8, ty: Type) -> String {
+pub fn show_vreg_element(
+    reg: Reg,
+    mb_rru: Option<&RealRegUniverse>,
+    idx: u8,
+    size: VectorSize,
+) -> String {
     assert_eq!(RegClass::V128, reg.get_class());
     let mut s = reg.show_rru(mb_rru);
 
-    let suffix = match ty {
-        I8 => "b",
-        I16 => "h",
-        I32 => "s",
-        I64 => "d",
-        F32 => "s",
-        F64 => "d",
-        _ => unimplemented!(),
+    let suffix = match size {
+        VectorSize::Size8x8 => "b",
+        VectorSize::Size8x16 => "b",
+        VectorSize::Size16x4 => "h",
+        VectorSize::Size16x8 => "h",
+        VectorSize::Size32x2 => "s",
+        VectorSize::Size32x4 => "s",
+        VectorSize::Size64x2 => "d",
     };
 
     s.push_str(&format!(".{}[{}]", suffix, idx));
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 03a464be9a..d60fdfe144 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -14,7 +14,7 @@ use crate::ir::Inst as IRInst;
 use crate::ir::{InstructionData, Opcode, TrapCode, Type};
 use crate::machinst::lower::*;
 use crate::machinst::*;
-use crate::{CodegenError, CodegenResult};
+use crate::CodegenResult;
 
 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::AArch64Backend;
@@ -736,20 +736,11 @@ pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
     ty: Type,
     cond: Cond,
 ) -> CodegenResult<()> {
-    match ty {
-        F32X4 | F64X2 | I8X16 | I16X8 | I32X4 => {}
-        _ => {
-            return Err(CodegenError::Unsupported(format!(
-                "unsupported SIMD type: {:?}",
-                ty
-            )));
-        }
-    };
-
     let is_float = match ty {
         F32X4 | F64X2 => true,
         _ => false,
     };
+    let size = VectorSize::from_ty(ty);
     // 'Less than' operations are implemented by swapping
     // the order of operands and using the 'greater than'
     // instructions.
@@ -784,7 +775,7 @@ pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
         rd,
         rn,
         rm,
-        ty,
+        size,
     });
 
     if cond == Cond::Ne {
@@ -792,7 +783,7 @@ pub(crate) fn lower_vector_compare<C: LowerCtx<I = Inst>>(
             op: VecMisc2::Not,
             rd,
             rn: rd.to_reg(),
-            ty: I8X16,
+            size,
         });
     }
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 004e59441d..80b4518f9f 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -70,7 +70,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rn,
                     rm,
                     alu_op: VecALUOp::Add,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
@@ -89,13 +89,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rn,
                     rm,
                     alu_op: VecALUOp::Sub,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
         Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
-            // We use the vector instruction set's saturating adds (UQADD /
-            // SQADD), which require vector registers.
+            // We use the scalar SIMD & FP saturating additions and subtractions
+            // (SQADD / UQADD / SQSUB / UQSUB), which require scalar FP registers.
             let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat;
             let ty = ty.unwrap();
             let rd = get_output_reg(ctx, outputs[0]);
@@ -105,11 +105,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 } else {
                     NarrowValueMode::ZeroExtend64
                 };
-                let alu_op = match op {
-                    Opcode::UaddSat => VecALUOp::UQAddScalar,
-                    Opcode::SaddSat => VecALUOp::SQAddScalar,
-                    Opcode::UsubSat => VecALUOp::UQSubScalar,
-                    Opcode::SsubSat => VecALUOp::SQSubScalar,
+                let fpu_op = match op {
+                    Opcode::UaddSat => FPUOp2::Uqadd64,
+                    Opcode::SaddSat => FPUOp2::Sqadd64,
+                    Opcode::UsubSat => FPUOp2::Uqsub64,
+                    Opcode::SsubSat => FPUOp2::Sqsub64,
                     _ => unreachable!(),
                 };
                 let va = ctx.alloc_tmp(RegClass::V128, I128);
@@ -118,18 +118,17 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
                 ctx.emit(Inst::MovToVec64 { rd: va, rn: ra });
                 ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb });
-                ctx.emit(Inst::VecRRR {
+                ctx.emit(Inst::FpuRRR {
+                    fpu_op,
                     rd: va,
                     rn: va.to_reg(),
                     rm: vb.to_reg(),
-                    alu_op,
-                    ty: I64,
                 });
                 ctx.emit(Inst::MovFromVec {
                     rd,
                     rn: va.to_reg(),
                     idx: 0,
-                    ty: I64,
+                    size: VectorSize::Size64x2,
                 });
             } else {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -148,7 +147,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rn,
                     rm,
                     alu_op,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
@@ -167,7 +166,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     op: VecMisc2::Neg,
                     rd,
                     rn,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
@@ -192,7 +191,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rd,
                     rn,
                     rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
@@ -422,7 +421,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     op: VecMisc2::Not,
                     rd,
                     rn: rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
@@ -466,7 +465,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rd,
                     rn,
                     rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
@@ -495,7 +494,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm));
             } else {
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-
+                let size = VectorSize::from_ty(ty);
                 let (alu_op, is_right_shift) = match op {
                     Opcode::Ishl => (VecALUOp::Sshl, false),
                     Opcode::Ushr => (VecALUOp::Ushl, true),
@@ -514,18 +513,14 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     put_input_in_reg(ctx, inputs[1], NarrowValueMode::None)
                 };
 
-                ctx.emit(Inst::VecDup {
-                    rd,
-                    rn: rm,
-                    ty: ty.lane_type(),
-                });
+                ctx.emit(Inst::VecDup { rd, rn: rm, size });
 
                 ctx.emit(Inst::VecRRR {
                     alu_op,
                     rd,
                     rn,
                     rm: rd.to_reg(),
-                    ty,
+                    size,
                 });
             }
         }
@@ -1167,7 +1162,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rd,
                     rn,
                     rm,
-                    ty,
+                    size: VectorSize::from_ty(ty),
                 });
             }
         }
@@ -1297,7 +1292,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         rd,
                         rn,
                         idx: 0,
-                        ty: I64,
+                        size: VectorSize::Size64x2,
                     });
                 }
             }
@@ -1557,15 +1552,15 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let idx = *imm;
                 let rd = get_output_reg(ctx, outputs[0]);
                 let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
                 let ty = ty.unwrap();
 
                 if ty_is_int(ty) {
-                    ctx.emit(Inst::MovFromVec { rd, rn, idx, ty });
+                    ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                 // Plain moves are faster on some processors.
                 } else if idx == 0 {
                     ctx.emit(Inst::gen_move(rd, rn, ty));
                 } else {
-                    let size = ScalarSize::from_ty(ty);
                     ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, size });
                 }
             } else {
@@ -1576,11 +1571,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Splat => {
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]);
-            let ty = ctx.input_ty(insn, 0);
-            let inst = if ty_is_int(ty) {
-                Inst::VecDup { rd, rn, ty }
+            let input_ty = ctx.input_ty(insn, 0);
+            let size = VectorSize::from_ty(ty.unwrap());
+            let inst = if ty_is_int(input_ty) {
+                Inst::VecDup { rd, rn, size }
             } else {
-                Inst::VecDupFromFpu { rd, rn, ty }
+                Inst::VecDupFromFpu { rd, rn, size }
             };
             ctx.emit(inst);
         }
@@ -1598,21 +1594,22 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // cmp xm, #0
             // cset xm, ne
 
-            let input_ty = ctx.input_ty(insn, 0);
+            let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
+
             if op == Opcode::VanyTrue {
                 ctx.emit(Inst::VecRRR {
                     alu_op: VecALUOp::Umaxp,
                     rd: tmp,
                     rn: rm,
                     rm: rm,
-                    ty: input_ty,
+                    size,
                 });
             } else {
                 ctx.emit(Inst::VecLanes {
                     op: VecLanesOp::Uminv,
                     rd: tmp,
                     rn: rm,
-                    ty: input_ty,
+                    size,
                 });
             };
 
@@ -1620,7 +1617,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 rd,
                 rn: tmp.to_reg(),
                 idx: 0,
-                ty: I64,
+                size: VectorSize::Size64x2,
             });
 
             ctx.emit(Inst::AluRRImm12 {

From abf157bd6999caa3c96d902cb2396c746f91a877 Mon Sep 17 00:00:00 2001
From: Benjamin Bouvier <public@benj.me>
Date: Thu, 9 Jul 2020 17:18:22 +0200
Subject: [PATCH 02/11] machinst x64: Only use the feature flag to enable the
 x64 new backend;

Before this patch, running the x64 new backend would require both
compiling with --features experimental_x64 and running with
`use_new_backend`.

This patch changes this behavior so that the runtime flag is not
needed anymore: using the feature flag will enforce usage of the new
backend everywhere, making using and testing it much simpler:

    cargo run --features experimental_x64 ;; other CLI options/flags

This also gives a hint at what the meta language generation would look
like after switching to the new backend.

Compiling only with the x64 codegen flag gives a nice compile time speedup.
---
 cranelift/Cargo.toml                          |  1 +
 cranelift/codegen/Cargo.toml                  |  1 -
 cranelift/codegen/build.rs                    | 27 +++++++++--
 cranelift/codegen/meta/src/gen_legalizer.rs   |  7 +++
 cranelift/codegen/meta/src/isa/mod.rs         |  4 +-
 cranelift/codegen/meta/src/isa/x86/mod.rs     |  2 +-
 .../codegen/meta/src/isa/x86/settings.rs      |  6 ---
 cranelift/codegen/meta/src/lib.rs             | 48 +++++++++++++++++--
 cranelift/codegen/src/isa/mod.rs              |  6 ++-
 cranelift/codegen/src/isa/x64/mod.rs          | 28 +++++++----
 cranelift/codegen/src/isa/x64/settings.rs     |  9 ++++
 cranelift/codegen/src/isa/x86/mod.rs          | 20 +++-----
 cranelift/codegen/src/legalizer/mod.rs        | 16 ++++++-
 crates/jit/src/link.rs                        |  3 +-
 14 files changed, 134 insertions(+), 44 deletions(-)
 create mode 100644 cranelift/codegen/src/isa/x64/settings.rs

diff --git a/cranelift/Cargo.toml b/cranelift/Cargo.toml
index 7c63bea275..d72bcc4d91 100644
--- a/cranelift/Cargo.toml
+++ b/cranelift/Cargo.toml
@@ -48,3 +48,4 @@ default = ["disas", "wasm", "cranelift-codegen/all-arch"]
 disas = ["capstone"]
 enable-peepmatic = ["cranelift-codegen/enable-peepmatic", "cranelift-filetests/enable-peepmatic"]
 wasm = ["wat", "cranelift-wasm"]
+experimental_x64 = ["cranelift-codegen/x64"]
diff --git a/cranelift/codegen/Cargo.toml b/cranelift/codegen/Cargo.toml
index cdafe049e2..a78869265a 100644
--- a/cranelift/codegen/Cargo.toml
+++ b/cranelift/codegen/Cargo.toml
@@ -66,7 +66,6 @@ x64 = [] # New work-in-progress codegen backend for x86_64 based on the new isel
 # Option to enable all architectures.
 all-arch = [
     "x86",
-    "x64",
     "arm32",
     "arm64",
     "riscv"
diff --git a/cranelift/codegen/build.rs b/cranelift/codegen/build.rs
index b7352f37c3..2caf32609d 100644
--- a/cranelift/codegen/build.rs
+++ b/cranelift/codegen/build.rs
@@ -26,7 +26,15 @@ fn main() {
     let out_dir = env::var("OUT_DIR").expect("The OUT_DIR environment variable must be set");
     let target_triple = env::var("TARGET").expect("The TARGET environment variable must be set");
 
-    // Configure isa targets cfg.
+    let new_backend_isas = if env::var("CARGO_FEATURE_X64").is_ok() {
+        // The x64 (new backend for x86_64) is a bit particular: it only requires generating
+        // the shared meta code; the only ISA-specific code is for settings.
+        vec![meta::isa::Isa::X86]
+    } else {
+        Vec::new()
+    };
+
+    // Configure isa targets using the old backend.
     let isa_targets = meta::isa::Isa::all()
         .iter()
         .cloned()
@@ -36,7 +44,7 @@ fn main() {
         })
         .collect::<Vec<_>>();
 
-    let isas = if isa_targets.is_empty() {
+    let old_backend_isas = if new_backend_isas.is_empty() && isa_targets.is_empty() {
         // Try to match native target.
         let target_name = target_triple.split('-').next().unwrap();
         let isa = meta::isa_from_arch(&target_name).expect("error when identifying target");
@@ -56,14 +64,23 @@ fn main() {
         crate_dir.join("build.rs").to_str().unwrap()
     );
 
-    if let Err(err) = meta::generate(&isas, &out_dir) {
+    if let Err(err) = meta::generate(&old_backend_isas, &new_backend_isas, &out_dir) {
         eprintln!("Error: {}", err);
         process::exit(1);
     }
 
     if env::var("CRANELIFT_VERBOSE").is_ok() {
-        for isa in &isas {
-            println!("cargo:warning=Includes support for {} ISA", isa.to_string());
+        for isa in &old_backend_isas {
+            println!(
+                "cargo:warning=Includes old-backend support for {} ISA",
+                isa.to_string()
+            );
+        }
+        for isa in &new_backend_isas {
+            println!(
+                "cargo:warning=Includes new-backend support for {} ISA",
+                isa.to_string()
+            );
         }
         println!(
             "cargo:warning=Build step took {:?}.",
diff --git a/cranelift/codegen/meta/src/gen_legalizer.rs b/cranelift/codegen/meta/src/gen_legalizer.rs
index 759121894f..7b56b8db48 100644
--- a/cranelift/codegen/meta/src/gen_legalizer.rs
+++ b/cranelift/codegen/meta/src/gen_legalizer.rs
@@ -700,6 +700,7 @@ fn gen_isa(
 pub(crate) fn generate(
     isas: &[TargetIsa],
     transform_groups: &TransformGroups,
+    extra_legalization_groups: &[&'static str],
     filename_prefix: &str,
     out_dir: &str,
 ) -> Result<(), error::Error> {
@@ -711,8 +712,14 @@ pub(crate) fn generate(
         fmt.update_file(format!("{}-{}.rs", filename_prefix, isa.name), out_dir)?;
     }
 
+    // Add extra legalization groups that were explicitly requested.
+    for group in extra_legalization_groups {
+        shared_group_names.insert(group);
+    }
+
     // Generate shared legalize groups.
     let mut fmt = Formatter::new();
+    // Generate shared legalize groups.
     let mut type_sets = UniqueTable::new();
     let mut sorted_shared_group_names = Vec::from_iter(shared_group_names);
     sorted_shared_group_names.sort();
diff --git a/cranelift/codegen/meta/src/isa/mod.rs b/cranelift/codegen/meta/src/isa/mod.rs
index 39cd913300..ed8db85f0d 100644
--- a/cranelift/codegen/meta/src/isa/mod.rs
+++ b/cranelift/codegen/meta/src/isa/mod.rs
@@ -6,10 +6,10 @@ use std::fmt;
 mod arm32;
 mod arm64;
 mod riscv;
-mod x86;
+pub(crate) mod x86;
 
 /// Represents known ISA target.
-#[derive(Copy, Clone)]
+#[derive(PartialEq, Copy, Clone)]
 pub enum Isa {
     Riscv,
     X86,
diff --git a/cranelift/codegen/meta/src/isa/x86/mod.rs b/cranelift/codegen/meta/src/isa/x86/mod.rs
index 56f35770a8..a272e83900 100644
--- a/cranelift/codegen/meta/src/isa/x86/mod.rs
+++ b/cranelift/codegen/meta/src/isa/x86/mod.rs
@@ -14,7 +14,7 @@ mod legalize;
 mod opcodes;
 mod recipes;
 mod registers;
-mod settings;
+pub(crate) mod settings;
 
 pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     let settings = settings::define(&shared_defs.settings);
diff --git a/cranelift/codegen/meta/src/isa/x86/settings.rs b/cranelift/codegen/meta/src/isa/x86/settings.rs
index 0059bf0864..dddd69abb3 100644
--- a/cranelift/codegen/meta/src/isa/x86/settings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/settings.rs
@@ -3,12 +3,6 @@ use crate::cdsl::settings::{PredicateNode, SettingGroup, SettingGroupBuilder};
 pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
     let mut settings = SettingGroupBuilder::new("x86");
 
-    settings.add_bool(
-        "use_new_backend",
-        "Whether to use the new codegen backend using the new isel",
-        false,
-    );
-
     // CPUID.01H:ECX
     let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
     let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);
diff --git a/cranelift/codegen/meta/src/lib.rs b/cranelift/codegen/meta/src/lib.rs
index 796e2a110d..ead2c4442f 100644
--- a/cranelift/codegen/meta/src/lib.rs
+++ b/cranelift/codegen/meta/src/lib.rs
@@ -25,7 +25,11 @@ pub fn isa_from_arch(arch: &str) -> Result<isa::Isa, String> {
 }
 
 /// Generates all the Rust source files used in Cranelift from the meta-language.
-pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
+pub fn generate(
+    old_backend_isas: &[isa::Isa],
+    new_backend_isas: &[isa::Isa],
+    out_dir: &str,
+) -> Result<(), error::Error> {
     // Create all the definitions:
     // - common definitions.
     let mut shared_defs = shared::define();
@@ -39,7 +43,7 @@ pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
     gen_types::generate("types.rs", &out_dir)?;
 
     // - per ISA definitions.
-    let isas = isa::define(isas, &mut shared_defs);
+    let target_isas = isa::define(old_backend_isas, &mut shared_defs);
 
     // At this point, all definitions are done.
     let all_formats = shared_defs.verify_instruction_formats();
@@ -53,9 +57,22 @@ pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
         &out_dir,
     )?;
 
-    gen_legalizer::generate(&isas, &shared_defs.transform_groups, "legalize", &out_dir)?;
+    let extra_legalization_groups: &[&'static str] = if !new_backend_isas.is_empty() {
+        // The new backend only requires the "expand" legalization group.
+        &["expand"]
+    } else {
+        &[]
+    };
 
-    for isa in isas {
+    gen_legalizer::generate(
+        &target_isas,
+        &shared_defs.transform_groups,
+        extra_legalization_groups,
+        "legalize",
+        &out_dir,
+    )?;
+
+    for isa in target_isas {
         gen_registers::generate(&isa, &format!("registers-{}.rs", isa.name), &out_dir)?;
 
         gen_settings::generate(
@@ -80,5 +97,28 @@ pub fn generate(isas: &[isa::Isa], out_dir: &str) -> Result<(), error::Error> {
         )?;
     }
 
+    for isa in new_backend_isas {
+        match isa {
+            isa::Isa::X86 => {
+                // If the old backend ISAs contained x86, this file has already been generated.
+                if old_backend_isas.iter().any(|isa| *isa == isa::Isa::X86) {
+                    continue;
+                }
+
+                let settings = crate::isa::x86::settings::define(&shared_defs.settings);
+                gen_settings::generate(
+                    &settings,
+                    gen_settings::ParentGroup::Shared,
+                    "settings-x86.rs",
+                    &out_dir,
+                )?;
+            }
+            isa::Isa::Arm64 => {
+                // aarch64 doesn't have platform-specific settings.
+            }
+            isa::Isa::Arm32 | isa::Isa::Riscv => todo!(),
+        }
+    }
+
     Ok(())
 }
diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs
index 3bd84fbc6e..4ac40c06a4 100644
--- a/cranelift/codegen/src/isa/mod.rs
+++ b/cranelift/codegen/src/isa/mod.rs
@@ -121,7 +121,11 @@ pub fn lookup(triple: Triple) -> Result<Builder, LookupError> {
     match triple.architecture {
         Architecture::Riscv32 | Architecture::Riscv64 => isa_builder!(riscv, "riscv", triple),
         Architecture::I386 | Architecture::I586 | Architecture::I686 | Architecture::X86_64 => {
-            isa_builder!(x86, "x86", triple)
+            if cfg!(feature = "x64") {
+                isa_builder!(x64, "x64", triple)
+            } else {
+                isa_builder!(x86, "x86", triple)
+            }
         }
         Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple),
         Architecture::Aarch64 { .. } => isa_builder!(aarch64, "arm64", triple),
diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index 7666875a0e..271542378a 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -11,28 +11,33 @@ use crate::isa::Builder as IsaBuilder;
 use crate::machinst::pretty_print::ShowWithRRU;
 use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
 use crate::result::CodegenResult;
-use crate::settings::{self, Flags};
+use crate::settings::{self as shared_settings, Flags};
 
-use crate::isa::x64::inst::regs::create_reg_universe_systemv;
+use crate::isa::x64::{inst::regs::create_reg_universe_systemv, settings as x64_settings};
+
+use super::TargetIsa;
 
 mod abi;
 mod inst;
 mod lower;
+mod settings;
 
 /// An X64 backend.
 pub(crate) struct X64Backend {
     triple: Triple,
     flags: Flags,
+    _x64_flags: x64_settings::Flags,
     reg_universe: RealRegUniverse,
 }
 
 impl X64Backend {
     /// Create a new X64 backend with the given (shared) flags.
-    fn new_with_flags(triple: Triple, flags: Flags) -> Self {
+    fn new_with_flags(triple: Triple, flags: Flags, x64_flags: x64_settings::Flags) -> Self {
         let reg_universe = create_reg_universe_systemv(&flags);
         Self {
             triple,
             flags,
+            _x64_flags: x64_flags,
             reg_universe,
         }
     }
@@ -103,10 +108,17 @@ impl MachBackend for X64Backend {
 pub(crate) fn isa_builder(triple: Triple) -> IsaBuilder {
     IsaBuilder {
         triple,
-        setup: settings::builder(),
-        constructor: |triple: Triple, flags: Flags, _arch_flag_builder: settings::Builder| {
-            let backend = X64Backend::new_with_flags(triple, flags);
-            Box::new(TargetIsaAdapter::new(backend))
-        },
+        setup: x64_settings::builder(),
+        constructor: isa_constructor,
     }
 }
+
+fn isa_constructor(
+    triple: Triple,
+    shared_flags: Flags,
+    builder: shared_settings::Builder,
+) -> Box<dyn TargetIsa> {
+    let isa_flags = x64_settings::Flags::new(&shared_flags, builder);
+    let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
+    Box::new(TargetIsaAdapter::new(backend))
+}
diff --git a/cranelift/codegen/src/isa/x64/settings.rs b/cranelift/codegen/src/isa/x64/settings.rs
new file mode 100644
index 0000000000..c5371bb132
--- /dev/null
+++ b/cranelift/codegen/src/isa/x64/settings.rs
@@ -0,0 +1,9 @@
+//! x86 Settings.
+
+use crate::settings::{self, detail, Builder};
+use core::fmt;
+
+// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a
+// public `Flags` struct with an impl for all of the settings defined in
+// `cranelift-codegen/meta/src/isa/x86/settings.rs`.
+include!(concat!(env!("OUT_DIR"), "/settings-x86.rs"));
diff --git a/cranelift/codegen/src/isa/x86/mod.rs b/cranelift/codegen/src/isa/x86/mod.rs
index 4da21a879f..cbdeb3069d 100644
--- a/cranelift/codegen/src/isa/x86/mod.rs
+++ b/cranelift/codegen/src/isa/x86/mod.rs
@@ -57,20 +57,12 @@ fn isa_constructor(
 
     let isa_flags = settings::Flags::new(&shared_flags, builder);
 
-    if isa_flags.use_new_backend() {
-        #[cfg(not(feature = "x64"))]
-        panic!("new backend x86 support not included by cargo features!");
-
-        #[cfg(feature = "x64")]
-        super::x64::isa_builder(triple).finish(shared_flags)
-    } else {
-        Box::new(Isa {
-            triple,
-            isa_flags,
-            shared_flags,
-            cpumode: level1,
-        })
-    }
+    Box::new(Isa {
+        triple,
+        isa_flags,
+        shared_flags,
+        cpumode: level1,
+    })
 }
 
 impl TargetIsa for Isa {
diff --git a/cranelift/codegen/src/legalizer/mod.rs b/cranelift/codegen/src/legalizer/mod.rs
index 5bd5ac8f5a..3b33e55b1e 100644
--- a/cranelift/codegen/src/legalizer/mod.rs
+++ b/cranelift/codegen/src/legalizer/mod.rs
@@ -19,10 +19,24 @@ use crate::flowgraph::ControlFlowGraph;
 use crate::ir::types::{I32, I64};
 use crate::ir::{self, InstBuilder, MemFlags};
 use crate::isa::TargetIsa;
+
+#[cfg(any(
+    feature = "x86",
+    feature = "arm32",
+    feature = "arm64",
+    feature = "riscv"
+))]
 use crate::predicates;
+#[cfg(any(
+    feature = "x86",
+    feature = "arm32",
+    feature = "arm64",
+    feature = "riscv"
+))]
+use alloc::vec::Vec;
+
 use crate::timing;
 use alloc::collections::BTreeSet;
-use alloc::vec::Vec;
 
 mod boundary;
 mod call;
diff --git a/crates/jit/src/link.rs b/crates/jit/src/link.rs
index 68996f45ab..71284e8755 100644
--- a/crates/jit/src/link.rs
+++ b/crates/jit/src/link.rs
@@ -98,12 +98,13 @@ fn apply_reloc(
             write_unaligned(reloc_address as *mut u32, reloc_delta_u32);
         },
         #[cfg(target_pointer_width = "64")]
-        (RelocationKind::Relative, RelocationEncoding::X86Branch, 32) => unsafe {
+        (RelocationKind::Relative, RelocationEncoding::Generic, 32) => unsafe {
             let reloc_address = body.add(offset as usize) as usize;
             let reloc_addend = r.addend() as isize;
             let reloc_delta_u64 = (target_func_address as u64)
                 .wrapping_sub(reloc_address as u64)
                 .wrapping_add(reloc_addend as u64);
+            // TODO implement far calls mode in x64 new backend.
             assert!(
                 reloc_delta_u64 as isize <= i32::max_value() as isize,
                 "relocation too large to fit in i32"

From 806d197472ebcbfe32075c61fac562449ad7f48f Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Wed, 15 Jul 2020 10:22:08 -0500
Subject: [PATCH 03/11] Update platform support docs (#2023)

Be sure to mention Linux AArch64 as a supported platform of Wasmtime
now.
---
 docs/stability-platform-support.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/stability-platform-support.md b/docs/stability-platform-support.md
index d2ed7fb8bf..4d9f007848 100644
--- a/docs/stability-platform-support.md
+++ b/docs/stability-platform-support.md
@@ -10,6 +10,7 @@ snapshot of what the current state of the world looks like.
 All features of `wasmtime` should work on the following platforms:
 
 * Linux x86\_64
+* Linux aarch64
 * macOS x86\_64
 * Windows x86\_64
 
@@ -18,9 +19,8 @@ sections below!
 
 ## JIT compiler support
 
-The JIT compiler, backed by either `lightbeam` or `cranelift` supports only the
-x86\_64 architecture at this time. Support for at least ARM, AArch64, and x86 is
-planned at this time.
+The JIT compiler, backed by Cranelift, supports the x86\_64 and aarch64
+architectures at this time. Support for at least ARM and x86 is planned as well.
 
 Usage of the JIT compiler will require a host operating system which supports
 creating executable memory pages on-the-fly. In Rust terms this generally means
@@ -39,5 +39,6 @@ much else will be needed.
 The `wasmtime` project does not currently use `#[no_std]` for its crates, but
 this is not because it won't support it! At this time we're still gathering use
 cases for for what `#[no_std]` might entail, so if you're interested in this
-we'd love to hear about your use case! Feel free to open an issue on the
+we'd love to hear about your use case! Feel free to [open an
+issue](https://github.com/bytecodealliance/wasmtime/issues/new) on the
 `wasmtime` repository to discuss this.

From 0e5e8a62c85e717ad53bd1b8756c1a6f2aec4ac8 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 7 Jul 2020 13:37:37 -0700
Subject: [PATCH 04/11] Add `DerivedFunction` for doubling lane widths and
 halving the number of lanes (i.e. merging)

Certain operations (e.g. widening) will have operands with types like `NxM` but will return results with types like `(N*2)x(M/2)` (double the lane width, halve the number of lanes; maintain the same number of vector bits). This is equivalent to applying two `DerivedFunction`s to the type: `DerivedFunction::DoubleWidth` then `DerivedFunction::HalfVector`. Since there is no easy way to apply multiple `DerivedFunction`s (e.g. most of the logic is one-level deep, https://github.com/bytecodealliance/wasmtime/blob/1d5a678124e0f035f7614cafe43066c834a5113b/cranelift/codegen/meta/src/gen_inst.rs#L618-L621), I added `DerivedFunction::MergeLanes` to do the necessary type conversion.
---
 cranelift/codegen/meta/src/cdsl/typevar.rs | 29 +++++++++++++++++++++-
 cranelift/codegen/src/ir/instructions.rs   |  8 ++++++
 cranelift/codegen/src/ir/types.rs          | 13 +++++++++-
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/cranelift/codegen/meta/src/cdsl/typevar.rs b/cranelift/codegen/meta/src/cdsl/typevar.rs
index 0c0b2e9647..752b458b2a 100644
--- a/cranelift/codegen/meta/src/cdsl/typevar.rs
+++ b/cranelift/codegen/meta/src/cdsl/typevar.rs
@@ -211,6 +211,24 @@ impl TypeVar {
                     "can't double 256 lanes"
                 );
             }
+            DerivedFunc::MergeLanes => {
+                assert!(
+                    ts.ints.is_empty() || *ts.ints.iter().max().unwrap() < MAX_BITS,
+                    "can't double all integer types"
+                );
+                assert!(
+                    ts.floats.is_empty() || *ts.floats.iter().max().unwrap() < MAX_FLOAT_BITS,
+                    "can't double all float types"
+                );
+                assert!(
+                    ts.bools.is_empty() || *ts.bools.iter().max().unwrap() < MAX_BITS,
+                    "can't double all boolean types"
+                );
+                assert!(
+                    *ts.lanes.iter().min().unwrap() > 1,
+                    "can't halve a scalar type"
+                );
+            }
             DerivedFunc::LaneOf | DerivedFunc::AsBool => { /* no particular assertions */ }
         }
 
@@ -248,6 +266,9 @@ impl TypeVar {
     pub fn split_lanes(&self) -> TypeVar {
         self.derived(DerivedFunc::SplitLanes)
     }
+    pub fn merge_lanes(&self) -> TypeVar {
+        self.derived(DerivedFunc::MergeLanes)
+    }
 
     /// Constrain the range of types this variable can assume to a subset of those in the typeset
     /// ts.
@@ -355,6 +376,7 @@ pub(crate) enum DerivedFunc {
     HalfVector,
     DoubleVector,
     SplitLanes,
+    MergeLanes,
 }
 
 impl DerivedFunc {
@@ -367,6 +389,7 @@ impl DerivedFunc {
             DerivedFunc::HalfVector => "half_vector",
             DerivedFunc::DoubleVector => "double_vector",
             DerivedFunc::SplitLanes => "split_lanes",
+            DerivedFunc::MergeLanes => "merge_lanes",
         }
     }
 
@@ -377,6 +400,8 @@ impl DerivedFunc {
             DerivedFunc::DoubleWidth => Some(DerivedFunc::HalfWidth),
             DerivedFunc::HalfVector => Some(DerivedFunc::DoubleVector),
             DerivedFunc::DoubleVector => Some(DerivedFunc::HalfVector),
+            DerivedFunc::MergeLanes => Some(DerivedFunc::SplitLanes),
+            DerivedFunc::SplitLanes => Some(DerivedFunc::MergeLanes),
             _ => None,
         }
     }
@@ -462,6 +487,7 @@ impl TypeSet {
             DerivedFunc::HalfVector => self.half_vector(),
             DerivedFunc::DoubleVector => self.double_vector(),
             DerivedFunc::SplitLanes => self.half_width().double_vector(),
+            DerivedFunc::MergeLanes => self.double_width().half_vector(),
         }
     }
 
@@ -601,7 +627,8 @@ impl TypeSet {
             DerivedFunc::DoubleWidth => self.half_width(),
             DerivedFunc::HalfVector => self.double_vector(),
             DerivedFunc::DoubleVector => self.half_vector(),
-            DerivedFunc::SplitLanes => self.half_vector().double_width(),
+            DerivedFunc::SplitLanes => self.double_width().half_vector(),
+            DerivedFunc::MergeLanes => self.half_width().double_vector(),
         }
     }
 
diff --git a/cranelift/codegen/src/ir/instructions.rs b/cranelift/codegen/src/ir/instructions.rs
index f835bd5f4a..2ba730b687 100644
--- a/cranelift/codegen/src/ir/instructions.rs
+++ b/cranelift/codegen/src/ir/instructions.rs
@@ -584,6 +584,9 @@ enum OperandConstraint {
 
     /// This operand is `ctrlType.split_lanes()`.
     SplitLanes,
+
+    /// This operand is `ctrlType.merge_lanes()`.
+    MergeLanes,
 }
 
 impl OperandConstraint {
@@ -615,6 +618,11 @@ impl OperandConstraint {
                     .split_lanes()
                     .expect("invalid type for split_lanes"),
             ),
+            MergeLanes => Bound(
+                ctrl_type
+                    .merge_lanes()
+                    .expect("invalid type for merge_lanes"),
+            ),
         }
     }
 }
diff --git a/cranelift/codegen/src/ir/types.rs b/cranelift/codegen/src/ir/types.rs
index 319f3ae66f..c669839da5 100644
--- a/cranelift/codegen/src/ir/types.rs
+++ b/cranelift/codegen/src/ir/types.rs
@@ -284,7 +284,7 @@ impl Type {
 
     /// Split the lane width in half and double the number of lanes to maintain the same bit-width.
     ///
-    /// If this is a scalar type of n bits, it produces a SIMD vector type of (n/2)x2.
+    /// If this is a scalar type of `n` bits, it produces a SIMD vector type of `(n/2)x2`.
     pub fn split_lanes(self) -> Option<Self> {
         match self.half_width() {
             Some(half_width) => half_width.by(2),
@@ -292,6 +292,17 @@ impl Type {
         }
     }
 
+    /// Merge lanes to halve the number of lanes and double the lane width to maintain the same
+    /// bit-width.
+    ///
+    /// If this is a scalar type, it will return `None`.
+    pub fn merge_lanes(self) -> Option<Self> {
+        match self.double_width() {
+            Some(double_width) => double_width.half_vector(),
+            None => None,
+        }
+    }
+
     /// Index of this type, for use with hash tables etc.
     pub fn index(self) -> usize {
         usize::from(self.0)

From fafef7db77e811ab329c08f5edacc78b3a3e3e53 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 7 Jul 2020 15:56:02 -0700
Subject: [PATCH 05/11] Add `x86_palignr` instructions

This instruction is necessary for implementing `[s|u]widen_high`.
---
 cranelift/codegen/meta/src/isa/x86/encodings.rs   | 10 ++++++++++
 .../codegen/meta/src/isa/x86/instructions.rs      | 15 +++++++++++++++
 cranelift/codegen/meta/src/isa/x86/opcodes.rs     |  4 ++++
 cranelift/codegen/src/isa/aarch64/lower_inst.rs   |  1 +
 .../isa/x86/simd-conversion-binemit.clif          | 10 ++++++----
 5 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 8f4a77d814..a58348d49b 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1697,6 +1697,7 @@ fn define_simd(
     let x86_pminu = x86.by_name("x86_pminu");
     let x86_pmullq = x86.by_name("x86_pmullq");
     let x86_pmuludq = x86.by_name("x86_pmuludq");
+    let x86_palignr = x86.by_name("x86_palignr");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_psll = x86.by_name("x86_psll");
@@ -1901,6 +1902,8 @@ fn define_simd(
             rec_fa.opcodes(low),
         );
     }
+
+    // SIMD narrow/widen
     for (ty, opcodes) in &[(I16, &PACKSSWB), (I32, &PACKSSDW)] {
         let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
         e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
@@ -1912,6 +1915,13 @@ fn define_simd(
         let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
         e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
     }
+    for ty in &[I8, I16, I32, I64] {
+        e.enc_both_inferred_maybe_isap(
+            x86_palignr.bind(vector(*ty, sse_vector_size)),
+            rec_fa_ib.opcodes(&PALIGNR[..]),
+            Some(use_ssse3_simd),
+        );
+    }
 
     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
     for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs
index 0e48784f23..7acd2e2c50 100644
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -664,6 +664,21 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let c = &Operand::new("c", uimm8)
+        .with_doc("The number of bytes to shift right; see PALIGNR in Intel manual for details");
+    ig.push(
+        Inst::new(
+            "x86_palignr",
+            r#"
+        Concatenate destination and source operands, extracting a byte-aligned result shifted to 
+        the right by `c`.
+        "#,
+            &formats.ternary_imm8,
+        )
+        .operands_in(vec![x, y, c])
+        .operands_out(vec![a]),
+    );
+
     let i64_t = &TypeVar::new(
         "i64_t",
         "A scalar 64bit integer",
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index c357488ddd..25685593a6 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -354,6 +354,10 @@ pub static PADDUSB: [u8; 3] = [0x66, 0x0f, 0xdc];
 /// Add packed unsigned word integers from xmm2/m128 and xmm1 saturate the results (SSE).
 pub static PADDUSW: [u8; 3] = [0x66, 0x0f, 0xdd];
 
+/// Concatenate destination and source operands, extract a byte-aligned result into xmm1 that is
+/// shifted to the right by the constant number of bytes in imm8 (SSSE3).
+pub static PALIGNR: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0f];
+
 /// Bitwise AND of xmm2/m128 and xmm1 (SSE2).
 pub static PAND: [u8; 3] = [0x66, 0x0f, 0xdb];
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 80b4518f9f..7fb878c87a 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2133,6 +2133,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::X86Insertps
         | Opcode::X86Movsd
         | Opcode::X86Movlhps
+        | Opcode::X86Palignr
         | Opcode::X86Psll
         | Opcode::X86Psrl
         | Opcode::X86Psra
diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
index ae1cdda753..b1a95c52d7 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
@@ -1,5 +1,6 @@
 test binemit
-target x86_64
+set enable_simd
+target x86_64 has_ssse3=true
 
 ; Ensure raw_bitcast emits no instructions.
 function %raw_bitcast_i16x8_to_b32x4() {
@@ -10,8 +11,9 @@ block0:
             return
 }
 
-function %fcvt_32(i32x4) {
-block0(v0: i32x4 [%xmm6]):
-[-, %xmm2]  v1 = fcvt_from_sint.f32x4 v0    ; bin: 40 0f 5b d6
+function %conversions_i32x4(i32x4, i32x4) {
+block0(v0: i32x4 [%xmm6], v1: i32x4 [%xmm4]):
+[-, %xmm2]  v2 = fcvt_from_sint.f32x4 v0    ; bin: 40 0f 5b d6
+[-, %xmm6]  v3 = x86_palignr v0, v1, 3      ; bin: 66 0f 3a 0f f4 03
             return
 }

From c8ddf8a34ced624b2c1fbb63bc786059a6387b29 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 7 Jul 2020 16:13:50 -0700
Subject: [PATCH 06/11] Encode `[u|s]widen_low` for x86

---
 .../codegen/meta/src/isa/x86/encodings.rs     | 12 +++
 cranelift/codegen/meta/src/isa/x86/opcodes.rs |  4 +-
 .../codegen/meta/src/shared/instructions.rs   | 81 +++++++++++++++++--
 .../codegen/src/isa/aarch64/lower_inst.rs     |  7 +-
 .../isa/x86/simd-conversion-binemit.clif      |  9 ++-
 5 files changed, 103 insertions(+), 10 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index a58348d49b..da04019a1b 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1669,6 +1669,7 @@ fn define_simd(
     let ssub_sat = shared.by_name("ssub_sat");
     let store = shared.by_name("store");
     let store_complex = shared.by_name("store_complex");
+    let swiden_low = shared.by_name("swiden_low");
     let uadd_sat = shared.by_name("uadd_sat");
     let uload8x8 = shared.by_name("uload8x8");
     let uload8x8_complex = shared.by_name("uload8x8_complex");
@@ -1678,6 +1679,7 @@ fn define_simd(
     let uload32x2_complex = shared.by_name("uload32x2_complex");
     let snarrow = shared.by_name("snarrow");
     let unarrow = shared.by_name("unarrow");
+    let uwiden_low = shared.by_name("uwiden_low");
     let ushr_imm = shared.by_name("ushr_imm");
     let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
@@ -1915,6 +1917,16 @@ fn define_simd(
         let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
         e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
     }
+    for (ty, swiden_opcode, uwiden_opcode) in &[
+        (I8, &PMOVSXBW[..], &PMOVZXBW[..]),
+        (I16, &PMOVSXWD[..], &PMOVZXWD[..]),
+    ] {
+        let isap = Some(use_sse41_simd);
+        let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap);
+        let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap);
+    }
     for ty in &[I8, I16, I32, I64] {
         e.enc_both_inferred_maybe_isap(
             x86_palignr.bind(vector(*ty, sse_vector_size)),
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index 25685593a6..09c07c458f 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -477,7 +477,7 @@ pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20];
 pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23];
 
 /// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
-/// integers in xmm1.
+/// integers in xmm1 (SSE4.1).
 pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25];
 
 /// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
@@ -489,7 +489,7 @@ pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30];
 pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33];
 
 /// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
-/// integers in xmm1.
+/// integers in xmm1 (SSE4.1).
 pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35];
 
 /// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index c78787ce82..1c06c4a325 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3883,9 +3883,9 @@ pub(crate) fn define(
         .constraints(vec![WiderOrEq(Int.clone(), IntTo.clone())]),
     );
 
-    let I16xN = &TypeVar::new(
-        "I16xN",
-        "A SIMD vector type containing integers 16-bits wide and up",
+    let I16or32xN = &TypeVar::new(
+        "I16or32xN",
+        "A SIMD vector type containing integer lanes 16 or 32 bits wide",
         TypeSetBuilder::new()
             .ints(16..32)
             .simd_lanes(4..8)
@@ -3893,9 +3893,9 @@ pub(crate) fn define(
             .build(),
     );
 
-    let x = &Operand::new("x", I16xN);
-    let y = &Operand::new("y", I16xN);
-    let a = &Operand::new("a", &I16xN.split_lanes());
+    let x = &Operand::new("x", I16or32xN);
+    let y = &Operand::new("y", I16or32xN);
+    let a = &Operand::new("a", &I16or32xN.split_lanes());
 
     ig.push(
         Inst::new(
@@ -3934,6 +3934,75 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let I8or16xN = &TypeVar::new(
+        "I8or16xN",
+        "A SIMD vector type containing integer lanes 8 or 16 bits wide.",
+        TypeSetBuilder::new()
+            .ints(8..16)
+            .simd_lanes(8..16)
+            .includes_scalars(false)
+            .build(),
+    );
+
+    let x = &Operand::new("x", I8or16xN);
+    let a = &Operand::new("a", &I8or16xN.merge_lanes());
+
+    ig.push(
+        Inst::new(
+            "swiden_low",
+            r#"
+        Widen the low lanes of `x` using signed extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "swiden_high",
+            r#"
+        Widen the high lanes of `x` using signed extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "uwiden_low",
+            r#"
+        Widen the low lanes of `x` using unsigned extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "uwiden_high",
+            r#"
+        Widen the high lanes of `x` using unsigned extension.
+        
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
     let IntTo = &TypeVar::new(
         "IntTo",
         "A larger integer type with the same number of lanes",
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 7fb878c87a..88751a1478 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2154,7 +2154,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::AvgRound => unimplemented!(),
         Opcode::Iabs => unimplemented!(),
-        Opcode::Snarrow | Opcode::Unarrow => unimplemented!(),
+        Opcode::Snarrow
+        | Opcode::Unarrow
+        | Opcode::SwidenLow
+        | Opcode::SwidenHigh
+        | Opcode::UwidenLow
+        | Opcode::UwidenHigh => unimplemented!(),
         Opcode::TlsValue => unimplemented!(),
     }
 
diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
index b1a95c52d7..72e3412279 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
@@ -1,6 +1,6 @@
 test binemit
 set enable_simd
-target x86_64 has_ssse3=true
+target x86_64 nehalem
 
 ; Ensure raw_bitcast emits no instructions.
 function %raw_bitcast_i16x8_to_b32x4() {
@@ -17,3 +17,10 @@ block0(v0: i32x4 [%xmm6], v1: i32x4 [%xmm4]):
 [-, %xmm6]  v3 = x86_palignr v0, v1, 3      ; bin: 66 0f 3a 0f f4 03
             return
 }
+
+function %conversions_i16x8(i16x8) {
+block0(v0: i16x8 [%xmm6]):
+[-, %xmm2]  v1 = swiden_low v0              ; bin: 66 0f 38 23 d6
+[-, %xmm11] v2 = uwiden_low v0              ; bin: 66 44 0f 38 33 de
+            return
+}

From f0b083c6ad2658d375abb690f711c2c0d41d0745 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 7 Jul 2020 16:22:26 -0700
Subject: [PATCH 07/11] Legalize `[u|s]widen_high` for x86

Use `x86_palignr` and `[u|s]widen_low` for legalizing this instruction.
---
 .../codegen/meta/src/isa/x86/legalize.rs      | 25 +++++++++++++++++++
 .../isa/x86/simd-conversion-legalize.clif     | 16 ++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 20f87ac265..de78c3b3b7 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -407,13 +407,18 @@ fn define_simd(
     let umax = insts.by_name("umax");
     let umin = insts.by_name("umin");
     let snarrow = insts.by_name("snarrow");
+    let swiden_high = insts.by_name("swiden_high");
+    let swiden_low = insts.by_name("swiden_low");
     let ushr_imm = insts.by_name("ushr_imm");
     let ushr = insts.by_name("ushr");
+    let uwiden_high = insts.by_name("uwiden_high");
+    let uwiden_low = insts.by_name("uwiden_low");
     let vconst = insts.by_name("vconst");
     let vall_true = insts.by_name("vall_true");
     let vany_true = insts.by_name("vany_true");
     let vselect = insts.by_name("vselect");
 
+    let x86_palignr = x86_instructions.by_name("x86_palignr");
     let x86_pmaxs = x86_instructions.by_name("x86_pmaxs");
     let x86_pmaxu = x86_instructions.by_name("x86_pmaxu");
     let x86_pmins = x86_instructions.by_name("x86_pmins");
@@ -786,6 +791,26 @@ fn define_simd(
         );
     }
 
+    // SIMD widen
+    for ty in &[I8, I16] {
+        let swiden_high = swiden_high.bind(vector(*ty, sse_vector_size));
+        narrow.legalize(
+            def!(b = swiden_high(a)),
+            vec![
+                def!(c = x86_palignr(a, a, uimm8_eight)),
+                def!(b = swiden_low(c)),
+            ],
+        );
+        let uwiden_high = uwiden_high.bind(vector(*ty, sse_vector_size));
+        narrow.legalize(
+            def!(b = uwiden_high(a)),
+            vec![
+                def!(c = x86_palignr(a, a, uimm8_eight)),
+                def!(b = uwiden_low(c)),
+            ],
+        );
+    }
+
     narrow.custom_legalize(shuffle, "convert_shuffle");
     narrow.custom_legalize(extractlane, "convert_extractlane");
     narrow.custom_legalize(insertlane, "convert_insertlane");
diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
index ccea16de2c..0115107810 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif
@@ -52,3 +52,19 @@ block0(v0:f32x4):
     ; nextln: v1 = iadd v12, v11
     return v1
 }
+
+function %uwiden_high(i8x16) -> i16x8 {
+block0(v0: i8x16):
+    v1 = uwiden_high v0
+    ; check: v2 = x86_palignr v0, v0, 8
+    ; nextln: v1 = uwiden_low v2
+    return v1
+}
+
+function %swiden_high(i16x8) -> i32x4 {
+block0(v0: i16x8):
+    v1 = swiden_high v0
+    ; check: v2 = x86_palignr v0, v0, 8
+    ; nextln: v1 = swiden_low v2
+    return v1
+}

From 3576d8c5bb135c8a44ca6595ad041f84be11246f Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 7 Jul 2020 16:54:18 -0700
Subject: [PATCH 08/11] Translate Wasm's `widen` instructions to Cranelift's
 `[u|s]widen_[low|high]`

---
 cranelift/wasm/src/code_translator.rs | 44 ++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 64556bdddb..79eae5c2a6 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1582,17 +1582,39 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let (a, b) = pop2_with_bitcast(state, I32X4, builder);
             state.push1(builder.ins().unarrow(a, b))
         }
-        Operator::I16x8WidenLowI8x16S { .. }
-        | Operator::I16x8WidenHighI8x16S { .. }
-        | Operator::I16x8WidenLowI8x16U { .. }
-        | Operator::I16x8WidenHighI8x16U { .. }
-        | Operator::I32x4WidenLowI16x8S { .. }
-        | Operator::I32x4WidenHighI16x8S { .. }
-        | Operator::I32x4WidenLowI16x8U { .. }
-        | Operator::I32x4WidenHighI16x8U { .. }
-        | Operator::I8x16Bitmask
-        | Operator::I16x8Bitmask
-        | Operator::I32x4Bitmask => {
+        Operator::I16x8WidenLowI8x16S => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().swiden_low(a))
+        }
+        Operator::I16x8WidenHighI8x16S => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().swiden_high(a))
+        }
+        Operator::I16x8WidenLowI8x16U => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().uwiden_low(a))
+        }
+        Operator::I16x8WidenHighI8x16U => {
+            let a = pop1_with_bitcast(state, I8X16, builder);
+            state.push1(builder.ins().uwiden_high(a))
+        }
+        Operator::I32x4WidenLowI16x8S => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().swiden_low(a))
+        }
+        Operator::I32x4WidenHighI16x8S => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().swiden_high(a))
+        }
+        Operator::I32x4WidenLowI16x8U => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().uwiden_low(a))
+        }
+        Operator::I32x4WidenHighI16x8U => {
+            let a = pop1_with_bitcast(state, I16X8, builder);
+            state.push1(builder.ins().uwiden_high(a))
+        }
+        Operator::I8x16Bitmask | Operator::I16x8Bitmask | Operator::I32x4Bitmask => {
             return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
         }
 

From 6a01b32474d765d83cc2cd172629e7061930caa3 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 7 Jul 2020 16:56:48 -0700
Subject: [PATCH 09/11] Enable final SIMD spec tests for x86

---
 build.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/build.rs b/build.rs
index f658c3a6da..95c4b03b63 100644
--- a/build.rs
+++ b/build.rs
@@ -202,8 +202,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             // to be a big chunk of work to implement them all there!
             ("simd", _) if target.contains("aarch64") => return true,
 
-            ("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
-
             // TODO(#1886): Ignore reference types tests if this isn't x64,
             // because Cranelift only supports reference types on x64.
             ("reference_types", _) => {

From a817470fab2bd2b13ea730830f06d618dd2d2425 Mon Sep 17 00:00:00 2001
From: Yury Delendik <ydelendik@mozilla.com>
Date: Wed, 15 Jul 2020 14:07:36 -0500
Subject: [PATCH 10/11] Fix signature of wasmtime_module_new

---
 crates/c-api/include/wasmtime.h |  2 +-
 crates/c-api/src/module.rs      | 14 ++++++++------
 examples/externref.c            |  2 +-
 examples/fib-debug/main.c       |  2 +-
 examples/gcd.c                  |  2 +-
 examples/hello.c                |  2 +-
 examples/hello.cc               |  2 +-
 examples/interrupt.c            |  2 +-
 examples/linking.c              |  4 ++--
 examples/memory.c               |  2 +-
 examples/multi.c                |  2 +-
 examples/wasi/main.c            |  2 +-
 12 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/crates/c-api/include/wasmtime.h b/crates/c-api/include/wasmtime.h
index eb6253b728..351b15433e 100644
--- a/crates/c-api/include/wasmtime.h
+++ b/crates/c-api/include/wasmtime.h
@@ -740,7 +740,7 @@ WASM_API_EXTERN own wasmtime_error_t *wasmtime_instance_new(
  * returned error and module are owned by the caller.
  */
 WASM_API_EXTERN own wasmtime_error_t *wasmtime_module_new(
-    wasm_store_t *store,
+    wasm_engine_t *engine,
     const wasm_byte_vec_t *binary,
     own wasm_module_t **ret
 );
diff --git a/crates/c-api/src/module.rs b/crates/c-api/src/module.rs
index ede8401e2f..7d2f0ff7c4 100644
--- a/crates/c-api/src/module.rs
+++ b/crates/c-api/src/module.rs
@@ -1,6 +1,6 @@
 use crate::{
-    handle_result, wasm_byte_vec_t, wasm_exporttype_t, wasm_exporttype_vec_t, wasm_importtype_t,
-    wasm_importtype_vec_t, wasm_store_t, wasmtime_error_t,
+    handle_result, wasm_byte_vec_t, wasm_engine_t, wasm_exporttype_t, wasm_exporttype_vec_t,
+    wasm_importtype_t, wasm_importtype_vec_t, wasm_store_t, wasmtime_error_t,
 };
 use std::ptr;
 use wasmtime::{Engine, Module};
@@ -29,7 +29,10 @@ pub extern "C" fn wasm_module_new(
     binary: &wasm_byte_vec_t,
 ) -> Option<Box<wasm_module_t>> {
     let mut ret = ptr::null_mut();
-    match wasmtime_module_new(store, binary, &mut ret) {
+    let engine = wasm_engine_t {
+        engine: store.store.engine().clone(),
+    };
+    match wasmtime_module_new(&engine, binary, &mut ret) {
         Some(_err) => None,
         None => {
             assert!(!ret.is_null());
@@ -40,13 +43,12 @@ pub extern "C" fn wasm_module_new(
 
 #[no_mangle]
 pub extern "C" fn wasmtime_module_new(
-    store: &wasm_store_t,
+    engine: &wasm_engine_t,
     binary: &wasm_byte_vec_t,
     ret: &mut *mut wasm_module_t,
 ) -> Option<Box<wasmtime_error_t>> {
     let binary = binary.as_slice();
-    let store = &store.store;
-    handle_result(Module::from_binary(store.engine(), binary), |module| {
+    handle_result(Module::from_binary(&engine.engine, binary), |module| {
         let imports = module
             .imports()
             .map(|i| wasm_importtype_t::new(i.module().to_owned(), i.name().to_owned(), i.ty()))
diff --git a/examples/externref.c b/examples/externref.c
index 92785a2022..ded28bd072 100644
--- a/examples/externref.c
+++ b/examples/externref.c
@@ -66,7 +66,7 @@ int main() {
   // Now that we've got our binary webassembly we can compile our module.
   printf("Compiling module...\n");
   wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
   wasm_byte_vec_delete(&wasm);
   if (error != NULL)
     exit_with_error("failed to compile module", error, NULL);
diff --git a/examples/fib-debug/main.c b/examples/fib-debug/main.c
index e133f8d6ac..a4e22dee3c 100644
--- a/examples/fib-debug/main.c
+++ b/examples/fib-debug/main.c
@@ -43,7 +43,7 @@ int main(int argc, const char* argv[]) {
   // Compile.
   printf("Compiling module...\n");
   wasm_module_t *module = NULL;
-  wasmtime_error_t* error = wasmtime_module_new(store, &binary, &module);
+  wasmtime_error_t* error = wasmtime_module_new(engine, &binary, &module);
   if (!module)
     exit_with_error("failed to compile module", error, NULL);
   wasm_byte_vec_delete(&binary);
diff --git a/examples/gcd.c b/examples/gcd.c
index bcbeed0940..285bc1593f 100644
--- a/examples/gcd.c
+++ b/examples/gcd.c
@@ -59,7 +59,7 @@ int main() {
 
   // Compile and instantiate our module
   wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
   if (module == NULL)
     exit_with_error("failed to compile module", error, NULL);
   wasm_byte_vec_delete(&wasm);
diff --git a/examples/hello.c b/examples/hello.c
index f9d4b5982a..fd268a84de 100644
--- a/examples/hello.c
+++ b/examples/hello.c
@@ -67,7 +67,7 @@ int main() {
   // Now that we've got our binary webassembly we can compile our module.
   printf("Compiling module...\n");
   wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
   wasm_byte_vec_delete(&wasm);
   if (error != NULL)
     exit_with_error("failed to compile module", error, NULL);
diff --git a/examples/hello.cc b/examples/hello.cc
index 57cfc5f360..45ac5302ec 100644
--- a/examples/hello.cc
+++ b/examples/hello.cc
@@ -67,7 +67,7 @@ int main() {
   // Now that we've got our binary webassembly we can compile our module.
   printf("Compiling module...\n");
   wasm_module_t *module = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
   wasm_byte_vec_delete(&wasm);
   if (error != NULL)
     exit_with_error("failed to compile module", error, NULL);
diff --git a/examples/interrupt.c b/examples/interrupt.c
index 81971b66bc..d2b5a3ac75 100644
--- a/examples/interrupt.c
+++ b/examples/interrupt.c
@@ -89,7 +89,7 @@ int main() {
   wasm_module_t *module = NULL;
   wasm_trap_t *trap = NULL;
   wasm_instance_t *instance = NULL;
-  error = wasmtime_module_new(store, &wasm, &module);
+  error = wasmtime_module_new(engine, &wasm, &module);
   wasm_byte_vec_delete(&wasm);
   if (error != NULL)
     exit_with_error("failed to compile module", error, NULL);
diff --git a/examples/linking.c b/examples/linking.c
index 7e6a29aed0..bc13f3cb97 100644
--- a/examples/linking.c
+++ b/examples/linking.c
@@ -45,10 +45,10 @@ int main() {
   wasmtime_error_t *error;
   wasm_module_t *linking1_module = NULL;
   wasm_module_t *linking2_module = NULL;
-  error = wasmtime_module_new(store, &linking1_wasm, &linking1_module);
+  error = wasmtime_module_new(engine, &linking1_wasm, &linking1_module);
   if (error != NULL)
     exit_with_error("failed to compile linking1", error, NULL);
-  error = wasmtime_module_new(store, &linking2_wasm, &linking2_module);
+  error = wasmtime_module_new(engine, &linking2_wasm, &linking2_module);
   if (error != NULL)
     exit_with_error("failed to compile linking2", error, NULL);
   wasm_byte_vec_delete(&linking1_wasm);
diff --git a/examples/memory.c b/examples/memory.c
index e2be709270..f430fe415b 100644
--- a/examples/memory.c
+++ b/examples/memory.c
@@ -158,7 +158,7 @@ int main(int argc, const char* argv[]) {
   // Compile.
   printf("Compiling module...\n");
   wasm_module_t* module = NULL;
-  error = wasmtime_module_new(store, &binary, &module);
+  error = wasmtime_module_new(engine, &binary, &module);
   if (error)
     exit_with_error("failed to compile module", error, NULL);
   wasm_byte_vec_delete(&binary);
diff --git a/examples/multi.c b/examples/multi.c
index 3248ec0215..a56883884f 100644
--- a/examples/multi.c
+++ b/examples/multi.c
@@ -91,7 +91,7 @@ int main(int argc, const char* argv[]) {
   // Compile.
   printf("Compiling module...\n");
   wasm_module_t* module = NULL;
-  error = wasmtime_module_new(store, &binary, &module);
+  error = wasmtime_module_new(engine, &binary, &module);
   if (error)
     exit_with_error("failed to compile module", error, NULL);
 
diff --git a/examples/wasi/main.c b/examples/wasi/main.c
index 68a978ccd2..2ad9592f4e 100644
--- a/examples/wasi/main.c
+++ b/examples/wasi/main.c
@@ -54,7 +54,7 @@ int main() {
 
   // Compile our modules
   wasm_module_t *module = NULL;
-  wasmtime_error_t *error = wasmtime_module_new(store, &wasm, &module);
+  wasmtime_error_t *error = wasmtime_module_new(engine, &wasm, &module);
   if (!module)
     exit_with_error("failed to compile module", error, NULL);
   wasm_byte_vec_delete(&wasm);

From a9455a8e5188ba70a2831279b5a3968e2c192539 Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald <fitzgen@gmail.com>
Date: Wed, 15 Jul 2020 17:55:31 -0700
Subject: [PATCH 11/11] C API tweaks for wasmtime-py (#2029)

* wasmtime-c-api: Only drop non-null `*mut wasm_ref_t`s

* wasmtime-c-api: Handle null refs in `wasm_val_t` to `Val` conversion

* wasmtime-c-api: Don't unwrap and rewrap `Option`s

The `unwrap` can panic, and there isn't any point to this unwrap+rewrap.

* wasmtime-c-api: Add conversions between `funcref` and `wasm_func_t`

* wasmtime-c-api: More ownership documentation for `wasmtime.h`
---
 crates/c-api/include/wasmtime.h | 36 ++++++++++++++++++++++++++++-----
 crates/c-api/src/func.rs        | 20 +++++++++++++++++-
 crates/c-api/src/table.rs       |  2 +-
 crates/c-api/src/val.rs         | 19 +++++++++++++++--
 4 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/crates/c-api/include/wasmtime.h b/crates/c-api/include/wasmtime.h
index 351b15433e..972dfb2f8e 100644
--- a/crates/c-api/include/wasmtime.h
+++ b/crates/c-api/include/wasmtime.h
@@ -515,8 +515,7 @@ typedef own wasm_trap_t* (*wasmtime_func_callback_t)(const wasmtime_caller_t* ca
  *
  * This function is the same as #wasm_func_callback_with_env_t except that its
  * first argument is a #wasmtime_caller_t which allows learning information
- * about the
- * caller.
+ * about the caller.
  */
 typedef own wasm_trap_t* (*wasmtime_func_callback_with_env_t)(const wasmtime_caller_t* caller, void* env, const wasm_val_t args[], wasm_val_t results[]);
 
@@ -544,6 +543,28 @@ WASM_API_EXTERN own wasm_func_t* wasmtime_func_new_with_env(
   void (*finalizer)(void*)
 );
 
+/**
+ * \brief Creates a new `funcref` value referencing `func`.
+ *
+ * Create a `funcref` value that references `func` and writes it to `funcrefp`.
+ *
+ * Gives ownership of the `funcref` value written to `funcrefp`.
+ *
+ * Both `func` and `funcrefp` must not be NULL.
+ */
+WASM_API_EXTERN void wasmtime_func_as_funcref(const wasm_func_t* func, wasm_val_t* funcrefp);
+
+/**
+ * \brief Get the `wasm_func_t*` referenced by the given `funcref` value.
+ *
+ * Gets an owning handle to the `wasm_func_t*` that the given `funcref` value is
+ * referencing. Returns NULL if the value is not a `funcref`, or if the value is
+ * a null function reference.
+ *
+ * The `val` pointer must not be NULL.
+ */
+WASM_API_EXTERN own wasm_func_t* wasmtime_funcref_as_func(const wasm_val_t* val);
+
 /**
  * \brief Loads a #wasm_extern_t from the caller's context
  *
@@ -845,8 +866,10 @@ WASM_API_EXTERN wasmtime_error_t *wasmtime_funcref_table_grow(
  * This function does not take an associated finalizer to clean up the data when
  * the reference is reclaimed. If you need a finalizer to clean up the data,
  * then use #wasmtime_externref_new_with_finalizer.
+ *
+ * Gives ownership of the newly created `externref` value.
  */
-WASM_API_EXTERN void wasmtime_externref_new(void *data, wasm_val_t *valp);
+WASM_API_EXTERN void wasmtime_externref_new(own void *data, wasm_val_t *valp);
 
 /**
  * \brief A finalizer for an `externref`'s wrapped data.
@@ -866,9 +889,11 @@ typedef void (*wasmtime_externref_finalizer_t)(void*);
  * When the reference is reclaimed, the wrapped data is cleaned up with the
  * provided finalizer. If you do not need to clean up the wrapped data, then use
  * #wasmtime_externref_new.
+ *
+ * Gives ownership of the newly created `externref` value.
  */
 WASM_API_EXTERN void wasmtime_externref_new_with_finalizer(
-    void *data,
+    own void *data,
     wasmtime_externref_finalizer_t finalizer,
     wasm_val_t *valp
 );
@@ -887,7 +912,8 @@ WASM_API_EXTERN void wasmtime_externref_new_with_finalizer(
  * If the given value is not an `externref`, returns `false` and leaves `datap`
  * unmodified.
  *
- * Does not take ownership of `val`.
+ * Does not take ownership of `val`. Does not give up ownership of the `void*`
+ * data written to `datap`.
  *
  * Both `val` and `datap` must not be `NULL`.
  */
diff --git a/crates/c-api/src/func.rs b/crates/c-api/src/func.rs
index fe494dbbcb..5c63e9782f 100644
--- a/crates/c-api/src/func.rs
+++ b/crates/c-api/src/func.rs
@@ -6,7 +6,7 @@ use std::mem::MaybeUninit;
 use std::panic::{self, AssertUnwindSafe};
 use std::ptr;
 use std::str;
-use wasmtime::{Caller, Extern, Func, Trap};
+use wasmtime::{Caller, Extern, Func, Trap, Val};
 
 #[derive(Clone)]
 #[repr(transparent)]
@@ -275,3 +275,21 @@ pub extern "C" fn wasmtime_caller_export_get(
     let which = caller.caller.get_export(name)?;
     Some(Box::new(wasm_extern_t { which }))
 }
+
+#[no_mangle]
+pub extern "C" fn wasmtime_func_as_funcref(
+    func: &wasm_func_t,
+    funcrefp: &mut MaybeUninit<wasm_val_t>,
+) {
+    let funcref = wasm_val_t::from_val(Val::FuncRef(Some(func.func().clone())));
+    crate::initialize(funcrefp, funcref);
+}
+
+#[no_mangle]
+pub extern "C" fn wasmtime_funcref_as_func(val: &wasm_val_t) -> Option<Box<wasm_func_t>> {
+    if let Val::FuncRef(Some(f)) = val.val() {
+        Some(Box::new(f.into()))
+    } else {
+        None
+    }
+}
diff --git a/crates/c-api/src/table.rs b/crates/c-api/src/table.rs
index c88620da85..6438f4976f 100644
--- a/crates/c-api/src/table.rs
+++ b/crates/c-api/src/table.rs
@@ -91,7 +91,7 @@ pub extern "C" fn wasm_table_get(
     index: wasm_table_size_t,
 ) -> Option<Box<wasm_ref_t>> {
     let val = t.table().get(index)?;
-    Some(val_into_ref(val).unwrap())
+    val_into_ref(val)
 }
 
 #[no_mangle]
diff --git a/crates/c-api/src/val.rs b/crates/c-api/src/val.rs
index 25754d4ed0..243df313f3 100644
--- a/crates/c-api/src/val.rs
+++ b/crates/c-api/src/val.rs
@@ -26,7 +26,9 @@ impl Drop for wasm_val_t {
     fn drop(&mut self) {
         match into_valtype(self.kind) {
             ValType::ExternRef => unsafe {
-                drop(Box::from_raw(self.of.ref_));
+                if !self.of.ref_.is_null() {
+                    drop(Box::from_raw(self.of.ref_));
+                }
             },
             _ => {}
         }
@@ -116,7 +118,20 @@ impl wasm_val_t {
             ValType::I64 => Val::from(unsafe { self.of.i64 }),
             ValType::F32 => Val::from(unsafe { self.of.f32 }),
             ValType::F64 => Val::from(unsafe { self.of.f64 }),
-            ValType::ExternRef | ValType::FuncRef => ref_to_val(unsafe { &*self.of.ref_ }),
+            ValType::ExternRef => unsafe {
+                if self.of.ref_.is_null() {
+                    Val::ExternRef(None)
+                } else {
+                    ref_to_val(&*self.of.ref_)
+                }
+            },
+            ValType::FuncRef => unsafe {
+                if self.of.ref_.is_null() {
+                    Val::FuncRef(None)
+                } else {
+                    ref_to_val(&*self.of.ref_)
+                }
+            },
             _ => unimplemented!("wasm_val_t::val {:?}", self.kind),
         }
     }