From 057c93b64e0fba69a347a088217df70a64396516 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 1 Jul 2020 10:46:59 -0700
Subject: [PATCH] Add `unarrow` instruction with x86 implementation

Adds a shared `unarrow` instruction to lower the Wasm SIMD specification's
unsigned narrowing (see
https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md#integer-to-integer-narrowing).
Additionally, this commit implements the instruction for x86 using PACKUSWB
and PACKUSDW for the applicable encodings.
---
 .../codegen/meta/src/isa/x86/encodings.rs     |  8 ++++++++
 cranelift/codegen/meta/src/isa/x86/opcodes.rs | 10 +++++++++-
 .../codegen/meta/src/shared/instructions.rs   | 20 +++++++++++++++++++
 .../codegen/src/isa/aarch64/lower_inst.rs     |  2 +-
 .../isa/x86/simd-lane-access-binemit.clif     |  3 ++-
 .../isa/x86/simd-lane-access-run.clif         |  7 +++++++
 6 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index a1d4de8ca5..8f4a77d814 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1677,6 +1677,7 @@ fn define_simd(
     let uload32x2 = shared.by_name("uload32x2");
     let uload32x2_complex = shared.by_name("uload32x2_complex");
     let snarrow = shared.by_name("snarrow");
+    let unarrow = shared.by_name("unarrow");
     let ushr_imm = shared.by_name("ushr_imm");
     let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
@@ -1904,6 +1905,13 @@ fn define_simd(
         let snarrow = snarrow.bind(vector(*ty, sse_vector_size));
         e.enc_both_inferred(snarrow, rec_fa.opcodes(*opcodes));
     }
+    for (ty, opcodes, isap) in &[
+        (I16, &PACKUSWB[..], None),
+        (I32, &PACKUSDW[..], Some(use_sse41_simd)),
+    ] {
+        let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
+    }
 
     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8).
     for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index f7f7480f9b..c357488ddd 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -314,7 +314,7 @@ pub static PABSD: [u8; 4] = [0x66, 0x0f, 0x38, 0x1e];
 /// xmm1 (SSSE3).
 pub static PABSW: [u8; 4] = [0x66, 0x0f, 0x38, 0x1d];
 
-/// Converts 8 packed signed word integers from xmm1 and from xxm2/m128 into 16 packed signed byte
+/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed signed byte
 /// integers in xmm1 using signed saturation (SSE2).
 pub static PACKSSWB: [u8; 3] = [0x66, 0x0f, 0x63];
 
@@ -322,6 +322,14 @@ pub static PACKSSWB: [u8; 3] = [0x66, 0x0f, 0x63];
 /// word integers in xmm1 using signed saturation (SSE2).
 pub static PACKSSDW: [u8; 3] = [0x66, 0x0f, 0x6b];
 
+/// Converts 8 packed signed word integers from xmm1 and from xmm2/m128 into 16 packed unsigned byte
+/// integers in xmm1 using unsigned saturation (SSE2).
+pub static PACKUSWB: [u8; 3] = [0x66, 0x0f, 0x67];
+
+/// Converts 4 packed signed doubleword integers from xmm1 and from xmm2/m128 into 8 packed unsigned
+/// word integers in xmm1 using unsigned saturation (SSE4.1).
+pub static PACKUSDW: [u8; 4] = [0x66, 0x0f, 0x38, 0x2b];
+
 /// Add packed byte integers from xmm2/m128 and xmm1 (SSE2).
 pub static PADDB: [u8; 3] = [0x66, 0x0f, 0xfc];
 
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index bad56b5f27..c78787ce82 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3914,6 +3914,26 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "unarrow",
+            r#"
+        Combine `x` and `y` into a vector with twice the lanes but half the integer width while
+        saturating overflowing values to the unsigned maximum and minimum.
+
+        Note that all input lanes are considered signed: any negative lanes will overflow and be
+        replaced with the unsigned minimum, `0x00`.
+
+        The lanes will be concatenated after narrowing. For example, when `x` and `y` are `i32x4`
+        and `x = [x3, x2, x1, x0]` and `y = [y3, y2, y1, y0]`, then after narrowing the value
+        returned is an `i16x8`: `a = [y3', y2', y1', y0', x3', x2', x1', x0']`.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let IntTo = &TypeVar::new(
         "IntTo",
         "A larger integer type with the same number of lanes",
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 2c67c1cd46..340d83ea29 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2070,7 +2070,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::AvgRound => unimplemented!(),
         Opcode::Iabs => unimplemented!(),
-        Opcode::Snarrow => unimplemented!(),
+        Opcode::Snarrow | Opcode::Unarrow => unimplemented!(),
         Opcode::TlsValue => unimplemented!(),
     }
 
diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
index e15d059eef..abee26fa4b 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif
@@ -118,8 +118,9 @@ block0(v0: i32x4 [%xmm7], v1: i32x4 [%xmm6]):
 return
 }
 
-function %snarrow_i16x8(i16x8, i16x8) {
+function %narrowing_i16x8(i16x8, i16x8) {
 block0(v0: i16x8 [%xmm7], v1: i16x8 [%xmm8]):
 [-, %xmm7] v2 = snarrow v0, v1 ; bin: 66 41 0f 63 f8
+[-, %xmm7] v3 = unarrow v0, v1 ; bin: 66 41 0f 67 f8
 return
 }
diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
index 013ea78679..0d58472a36 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
@@ -212,3 +212,10 @@ block0(v0: i32x4, v1: i32x4):
     return v2
 }
 ; run: %snarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 -1 0x7fff 4 5 -6 0xffff]
+
+function %unarrow(i32x4, i32x4) -> i16x8 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = unarrow v0, v1
+    return v2
+}
+; run: %unarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 0 0xffff 4 5 0 0]
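As a cross-check of the lane semantics documented for `unarrow` above, here is a minimal, standalone Rust sketch (not part of the patch, and not Cranelift API; all names are illustrative) of the i32x4-to-i16x8 case exercised by the run test and lowered to PACKUSDW: each signed input lane is clamped to the unsigned range of the narrower type, so negative lanes become 0 and lanes above the unsigned maximum saturate to it.

// Standalone sketch of unsigned-saturating narrowing for the i32x4 -> i16x8 case;
// names and helpers here are illustrative only.
fn unarrow_lane(lane: i32) -> u16 {
    // Clamp the signed input into [0, 0xffff]: negatives saturate to 0,
    // values above u16::MAX saturate to 0xffff.
    lane.clamp(0, u16::MAX as i32) as u16
}

fn unarrow_i32x4(x: [i32; 4], y: [i32; 4]) -> [u16; 8] {
    // Lanes of `x` fill the low half of the result and lanes of `y` the high half,
    // matching the concatenation order described in the instruction documentation.
    let mut out = [0u16; 8];
    for (i, lane) in x.iter().chain(y.iter()).enumerate() {
        out[i] = unarrow_lane(*lane);
    }
    out
}

fn main() {
    // Mirrors the run test above: 0xffffffff is -1 when read as a signed i32 lane.
    assert_eq!(
        unarrow_i32x4([0, 1, -1, 0x0001ffff], [4, 5, -6, -1]),
        [0, 1, 0, 0xffff, 4, 5, 0, 0]
    );
}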