From fa7481a68191bfe91154ef9a2be3e7c84a111f32 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Fri, 6 Mar 2020 11:29:21 -0800 Subject: [PATCH] Add x86 implementation of SIMD swizzle instruction --- .../codegen/meta/src/isa/x86/legalize.rs | 18 +++++++++++ .../codegen/meta/src/shared/instructions.rs | 31 ++++++++++++++++++- .../isa/x86/simd-lane-access-legalize.clif | 11 +++++++ .../isa/x86/simd-lane-access-run.clif | 26 ++++++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 4a10c64ed2..e3027a6814 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -338,7 +338,9 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro let splat = insts.by_name("splat"); let shuffle = insts.by_name("shuffle"); let sshr = insts.by_name("sshr"); + let swizzle = insts.by_name("swizzle"); let trueif = insts.by_name("trueif"); + let uadd_sat = insts.by_name("uadd_sat"); let umax = insts.by_name("umax"); let umin = insts.by_name("umin"); let ushr_imm = insts.by_name("ushr_imm"); @@ -375,6 +377,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro let uimm8_one = Literal::constant(&imm.uimm8, 0x01); let u128_zeroes = constant(vec![0x00; 16]); let u128_ones = constant(vec![0xff; 16]); + let u128_seventies = constant(vec![0x70; 16]); let a = var("a"); let b = var("b"); let c = var("c"); @@ -459,6 +462,21 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro ); } + // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec requiring + // mask indexes greater than 15 to produce a zeroed lane (not to select lane 0). For the spec discussion, + // see https://github.com/WebAssembly/simd/issues/93. 
+ { + let swizzle = swizzle.bind(vector(I8, sse_vector_size)); + narrow.legalize( + def!(a = swizzle(x, y)), + vec![ + def!(b = vconst(u128_seventies)), + def!(c = uadd_sat(y, b)), + def!(a = x86_pshufb(x, c)), + ], + ); + } + // SIMD bnot for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let bnot = bnot.bind(vector(ty, sse_vector_size)); diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 6fd7db155f..9bc344eec6 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -517,7 +517,36 @@ fn define_simd_lane_access( .operands_out(vec![a]), ); - let x = &Operand::new("x", TxN).with_doc("SIMD vector to modify"); + let I8x16 = &TypeVar::new( + "I8x16", + "A SIMD vector type consisting of 16 lanes of 8-bit integers", + TypeSetBuilder::new() + .ints(8..8) + .simd_lanes(16..16) + .includes_scalars(false) + .build(), + ); + let x = &Operand::new("x", I8x16).with_doc("Vector to modify by re-arranging lanes"); + let y = &Operand::new("y", I8x16).with_doc("Mask for re-arranging lanes"); + + ig.push( + Inst::new( + "swizzle", + r#" + Vector swizzle. + + Returns a new vector with byte-width lanes selected from the lanes of the first input + vector ``x`` specified in the second input vector ``y``. The indices ``i`` in range + ``[0, 15]`` select the ``i``-th element of ``x``. For indices outside of the range the + resulting lane is 0. Note that this operates on byte-width lanes. 
+ "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + let x = &Operand::new("x", TxN).with_doc("The vector to modify"); let y = &Operand::new("y", &TxN.lane_of()).with_doc("New lane value"); let Idx = &Operand::new("Idx", &imm.uimm8).with_doc("Lane index"); diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif index 547acbd5c4..3c222f1f7f 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif @@ -83,3 +83,14 @@ block0: ; nextln: v4 = vconst.i8x16 0x00 ; nextln: v1 = x86_pshufb v3, v4 ; nextln: return v1 + +function %swizzle() -> i8x16 { +block0: + v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v2 = swizzle.i8x16 v0, v1 + ; check: v3 = vconst.i8x16 0x70707070707070707070707070707070 + ; nextln: v4 = uadd_sat v1, v3 + ; nextln: v2 = x86_pshufb v0, v4 + return v2 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif index 80445cc67f..a99660f8fd 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif @@ -165,3 +165,29 @@ block0: return v8 } ; run + +function %swizzle() -> b1 { +block0: + v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v1 = vconst.i8x16 [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 42] + v2 = swizzle.i8x16 v0, v1 ; reverse the lanes; the out-of-range index 42 produces a zeroed lane + + v3 = vconst.i8x16 [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + v4 = icmp eq v2, v3 + v5 = vall_true v4 + return v5 +} +; run: + +function %swizzle_with_overflow() -> b1 { +block0: + v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v1 = vconst.i8x16 [16 250 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + 
+ v2 = swizzle.i8x16 v0, v1 ; 250 + 0x70 would overflow, but the add saturates so the MSB stays set (PSHUFB zeroes any lane whose index has the MSB set) + + v3 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + v4 = icmp eq v2, v3 + v5 = vall_true v4 + return v5 +} +; run: