From 90c49a2f7c58c225d83b6b786caaac0638a89515 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 18 Sep 2019 16:28:13 -0700
Subject: [PATCH] Add saturating subtraction with a SIMD encoding

This includes the new instructions `ssub_sat` and `usub_sat` and only encodes the i8x16 and i16x8 types; these are the only types needed to implement the SIMD spec (see https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md#saturating-integer-subtraction).
---
 .../codegen/meta/src/isa/x86/encodings.rs     | 20 +++++++++++++
 cranelift/codegen/meta/src/isa/x86/opcodes.rs | 16 ++++++++++
 .../codegen/meta/src/shared/instructions.rs   | 30 +++++++++++++++++++
 .../filetests/isa/x86/simd-arithmetic.clif    | 28 +++++++++++++++++
 cranelift/wasm/src/code_translator.rs         | 12 +++++---
 5 files changed, 102 insertions(+), 4 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 7a2d725030..1e92940487 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -480,6 +480,7 @@ pub(crate) fn define(
     let sqrt = shared.by_name("sqrt");
     let sshr = shared.by_name("sshr");
     let sshr_imm = shared.by_name("sshr_imm");
+    let ssub_sat = shared.by_name("ssub_sat");
     let stack_addr = shared.by_name("stack_addr");
     let store = shared.by_name("store");
     let store_complex = shared.by_name("store_complex");
@@ -501,6 +502,7 @@ pub(crate) fn define(
     let uload8_complex = shared.by_name("uload8_complex");
     let ushr = shared.by_name("ushr");
     let ushr_imm = shared.by_name("ushr_imm");
+    let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
     let x86_bsf = x86.by_name("x86_bsf");
     let x86_bsr = x86.by_name("x86_bsr");
@@ -1965,6 +1967,24 @@ pub(crate) fn define(
         e.enc_32_64(isub, rec_fa.opcodes(*opcodes));
     }
 
+    // SIMD integer saturating subtraction
+    e.enc_32_64(
+        ssub_sat.bind_vector_from_lane(I8, sse_vector_size),
+        rec_fa.opcodes(&PSUBSB),
+    );
+    e.enc_32_64(
+        ssub_sat.bind_vector_from_lane(I16, sse_vector_size),
+        rec_fa.opcodes(&PSUBSW),
+    );
+    e.enc_32_64(
+        usub_sat.bind_vector_from_lane(I8, sse_vector_size),
+        rec_fa.opcodes(&PSUBUSB),
+    );
+    e.enc_32_64(
+        usub_sat.bind_vector_from_lane(I16, sse_vector_size),
+        rec_fa.opcodes(&PSUBUSW),
+    );
+
     // SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
     // and I64x2 and these are (at the time of writing) not necessary for WASM SIMD.
     for (ty, opcodes, isap) in &[
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index 9d93f0cf14..f81d2423ea 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -326,6 +326,22 @@ pub static PSUBD: [u8; 3] = [0x66, 0x0f, 0xfa];
 /// Subtract packed quadword integers in xmm2/m128 from xmm1 (SSE2).
 pub static PSUBQ: [u8; 3] = [0x66, 0x0f, 0xfb];
 
+/// Subtract packed signed byte integers in xmm2/m128 from packed signed byte integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBSB: [u8; 3] = [0x66, 0x0f, 0xe8];
+
+/// Subtract packed signed word integers in xmm2/m128 from packed signed word integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBSW: [u8; 3] = [0x66, 0x0f, 0xe9];
+
+/// Subtract packed unsigned byte integers in xmm2/m128 from packed unsigned byte integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBUSB: [u8; 3] = [0x66, 0x0f, 0xd8];
+
+/// Subtract packed unsigned word integers in xmm2/m128 from packed unsigned word integers in xmm1
+/// and saturate results (SSE2).
+pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9];
+
 /// Push r{16,32,64}.
 pub static PUSH_REG: [u8; 1] = [0x50];
 
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 27eb3a3498..b9070c7f39 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -1736,6 +1736,36 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "usub_sat",
+            r#"
+        Subtract with unsigned saturation.
+
+        This is similar to `isub` but the operands are interpreted as unsigned integers and their 
+        difference, instead of wrapping, will be saturated to the lowest unsigned integer for
+        the controlling type (e.g. `0x00` for i8).
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "ssub_sat",
+            r#"
+        Subtract with signed saturation.
+
+        This is similar to `isub` but the operands are interpreted as signed integers and their 
+        difference, instead of wrapping, will be saturated to the lowest or highest 
+        signed integer for the controlling type (e.g. `0x80` or `0x7F` for i8).
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     ig.push(
         Inst::new(
             "ineg",
diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif
index c82c5deb68..c9d6c4c372 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif
@@ -190,3 +190,31 @@ ebb0:
     return v4
 }
 ; run
+
+function %sub_sat_i8x16() -> b1 {
+ebb0:
+[-, %xmm2]    v0 = vconst.i8x16 [128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ; 128 == 0x80 == -128
+[-, %xmm3]    v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

+[-, %xmm2]    v2 = ssub_sat v0, v1 ; bin: 66 0f e8 d3
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0x80 ; still -128, TODO it's unclear why I can't use -128 here

+    ; now re-use 0x80 as an unsigned 128
+[-, %xmm2]    v5 = usub_sat v0, v2 ; bin: 66 0f d8 d2
+    v6 = extractlane v5, 0
+    v7 = icmp_imm eq v6, 0
+
+    v8 = band v4, v7
+    return v8
+}
+; run
+
+function %sub_sat_i16x8() {
+ebb0:
+[-, %xmm3]    v0 = vconst.i16x8 [0 0 0 0 0 0 0 0]
+[-, %xmm5]    v1 = vconst.i16x8 [1 1 1 1 1 1 1 1]
+[-, %xmm3]    v2 = ssub_sat v0, v1 ; bin: 66 0f e9 dd
+[-, %xmm3]    v3 = usub_sat v0, v1 ; bin: 66 0f d9 dd
+    return ; v2 and v3 are unused: this function only exercises the encodings above
+}
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 85623f95fb..ad5345768d 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1012,6 +1012,14 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let (a, b) = state.pop2();
             state.push1(builder.ins().isub(a, b))
         }
+        Operator::I8x16SubSaturateS | Operator::I16x8SubSaturateS => {
+            let (a, b) = state.pop2();
+            state.push1(builder.ins().ssub_sat(a, b))
+        }
+        Operator::I8x16SubSaturateU | Operator::I16x8SubSaturateU => {
+            let (a, b) = state.pop2();
+            state.push1(builder.ins().usub_sat(a, b))
+        }
         Operator::I8x16Neg | Operator::I16x8Neg | Operator::I32x4Neg | Operator::I64x2Neg => {
             let a = state.pop1();
             state.push1(builder.ins().ineg(a))
@@ -1072,16 +1080,12 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::I8x16Shl
         | Operator::I8x16ShrS
         | Operator::I8x16ShrU
-        | Operator::I8x16SubSaturateS
-        | Operator::I8x16SubSaturateU
         | Operator::I8x16Mul
         | Operator::I16x8AnyTrue
         | Operator::I16x8AllTrue
         | Operator::I16x8Shl
         | Operator::I16x8ShrS
         | Operator::I16x8ShrU
-        | Operator::I16x8SubSaturateS
-        | Operator::I16x8SubSaturateU
         | Operator::I32x4AnyTrue
         | Operator::I32x4AllTrue
         | Operator::I32x4Shl