From 90c49a2f7c58c225d83b6b786caaac0638a89515 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 18 Sep 2019 16:28:13 -0700 Subject: [PATCH] Add saturating subtraction with a SIMD encoding This includes the new instructions `ssub_sat` and `usub_sat` and only encodes the i8x16 and i16x8 types; these are what is needed for implementing the SIMD spec (see https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md#saturating-integer-subtraction). --- .../codegen/meta/src/isa/x86/encodings.rs | 20 +++++++++++++ cranelift/codegen/meta/src/isa/x86/opcodes.rs | 16 ++++++++++ .../codegen/meta/src/shared/instructions.rs | 30 +++++++++++++++++++ .../filetests/isa/x86/simd-arithmetic.clif | 28 +++++++++++++++++ cranelift/wasm/src/code_translator.rs | 12 +++++--- 5 files changed, 102 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 7a2d725030..1e92940487 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -480,6 +480,7 @@ pub(crate) fn define( let sqrt = shared.by_name("sqrt"); let sshr = shared.by_name("sshr"); let sshr_imm = shared.by_name("sshr_imm"); + let ssub_sat = shared.by_name("ssub_sat"); let stack_addr = shared.by_name("stack_addr"); let store = shared.by_name("store"); let store_complex = shared.by_name("store_complex"); @@ -501,6 +502,7 @@ pub(crate) fn define( let uload8_complex = shared.by_name("uload8_complex"); let ushr = shared.by_name("ushr"); let ushr_imm = shared.by_name("ushr_imm"); + let usub_sat = shared.by_name("usub_sat"); let vconst = shared.by_name("vconst"); let x86_bsf = x86.by_name("x86_bsf"); let x86_bsr = x86.by_name("x86_bsr"); @@ -1965,6 +1967,24 @@ pub(crate) fn define( e.enc_32_64(isub, rec_fa.opcodes(*opcodes)); } + // SIMD integer saturating subtraction + e.enc_32_64( + ssub_sat.bind_vector_from_lane(I8, sse_vector_size), + rec_fa.opcodes(&PSUBSB), + ); + 
e.enc_32_64( + ssub_sat.bind_vector_from_lane(I16, sse_vector_size), + rec_fa.opcodes(&PSUBSW), + ); + e.enc_32_64( + usub_sat.bind_vector_from_lane(I8, sse_vector_size), + rec_fa.opcodes(&PSUBUSB), + ); + e.enc_32_64( + usub_sat.bind_vector_from_lane(I16, sse_vector_size), + rec_fa.opcodes(&PSUBUSW), + ); + // SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16 // and I64x2 and these are (at the time of writing) not necessary for WASM SIMD. for (ty, opcodes, isap) in &[ diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index 9d93f0cf14..f81d2423ea 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -326,6 +326,22 @@ pub static PSUBD: [u8; 3] = [0x66, 0x0f, 0xfa]; /// Subtract packed quadword integers in xmm2/m128 from xmm1 (SSE2). pub static PSUBQ: [u8; 3] = [0x66, 0x0f, 0xfb]; +/// Subtract packed signed byte integers in xmm2/m128 from packed signed byte integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBSB: [u8; 3] = [0x66, 0x0f, 0xe8]; + +/// Subtract packed signed word integers in xmm2/m128 from packed signed word integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBSW: [u8; 3] = [0x66, 0x0f, 0xe9]; + +/// Subtract packed unsigned byte integers in xmm2/m128 from packed unsigned byte integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBUSB: [u8; 3] = [0x66, 0x0f, 0xd8]; + +/// Subtract packed unsigned word integers in xmm2/m128 from packed unsigned word integers in xmm1 +/// and saturate results (SSE2). +pub static PSUBUSW: [u8; 3] = [0x66, 0x0f, 0xd9]; + /// Push r{16,32,64}. 
pub static PUSH_REG: [u8; 1] = [0x50]; diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 27eb3a3498..b9070c7f39 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -1736,6 +1736,36 @@ pub(crate) fn define( .operands_out(vec![a]), ); + ig.push( + Inst::new( + "usub_sat", + r#" + Subtract with unsigned saturation. + + This is similar to `isub` but the operands are interpreted as unsigned integers and their + difference, instead of wrapping, will be saturated to the lowest unsigned integer for + the controlling type (e.g. `0x00` for i8). + "#, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "ssub_sat", + r#" + Subtract with signed saturation. + + This is similar to `isub` but the operands are interpreted as signed integers and their + difference, instead of wrapping, will be saturated to the lowest or highest + signed integer for the controlling type (e.g. `0x80` or `0x7F` for i8). 
+ "#, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + ig.push( Inst::new( "ineg", diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif index c82c5deb68..c9d6c4c372 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic.clif @@ -190,3 +190,31 @@ ebb0: return v4 } ; run + +function %sub_sat_i8x16() -> b1 { +ebb0: +[-, %xmm2] v0 = vconst.i8x16 [128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ; 128 == 0x80 == -128 +[-, %xmm3] v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + +[-, %xmm2] v2 = ssub_sat v0, v1 ; bin: 66 0f e8 d3 + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 0x80 ; still -128, TODO it's unclear why I can't use -128 here + + ; now re-use 0x80 as an unsigned 128 +[-, %xmm2] v5 = usub_sat v0, v2 ; bin: 66 0f d8 d2 + v6 = extractlane v5, 0 + v7 = icmp_imm eq v6, 0 + + v8 = band v4, v7 + return v8 +} +; run + +function %sub_sat_i16x8() { +ebb0: +[-, %xmm3] v0 = vconst.i16x8 [0 0 0 0 0 0 0 0] +[-, %xmm5] v1 = vconst.i16x8 [1 1 1 1 1 1 1 1] +[-, %xmm3] v2 = ssub_sat v0, v1 ; bin: 66 0f e9 dd +[-, %xmm3] v3 = usub_sat v0, v1 ; bin: 66 0f d9 dd + return +} diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 85623f95fb..ad5345768d 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1012,6 +1012,14 @@ pub fn translate_operator( let (a, b) = state.pop2(); state.push1(builder.ins().isub(a, b)) } + Operator::I8x16SubSaturateS | Operator::I16x8SubSaturateS => { + let (a, b) = state.pop2(); + state.push1(builder.ins().ssub_sat(a, b)) + } + Operator::I8x16SubSaturateU | Operator::I16x8SubSaturateU => { + let (a, b) = state.pop2(); + state.push1(builder.ins().usub_sat(a, b)) + } Operator::I8x16Neg | Operator::I16x8Neg | Operator::I32x4Neg | Operator::I64x2Neg => { let a = state.pop1(); state.push1(builder.ins().ineg(a)) 
@@ -1072,16 +1080,12 @@ pub fn translate_operator( | Operator::I8x16Shl | Operator::I8x16ShrS | Operator::I8x16ShrU - | Operator::I8x16SubSaturateS - | Operator::I8x16SubSaturateU | Operator::I8x16Mul | Operator::I16x8AnyTrue | Operator::I16x8AllTrue | Operator::I16x8Shl | Operator::I16x8ShrS | Operator::I16x8ShrU - | Operator::I16x8SubSaturateS - | Operator::I16x8SubSaturateU | Operator::I32x4AnyTrue | Operator::I32x4AllTrue | Operator::I32x4Shl