From 3f47291f2e31d456d35c313ee89ab0349c7aa0ef Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Fri, 20 Mar 2020 18:59:20 -0700
Subject: [PATCH] Add x86 implementation of i8x16 `ushr`

This involves some large mask tables that may hurt code size but reduce
the number of instructions. See
https://github.com/WebAssembly/simd/issues/117 for a more in-depth
discussion on this.
---
 .../codegen/meta/src/isa/x86/legalize.rs    | 12 +---
 cranelift/codegen/src/isa/x86/enc_tables.rs | 69 ++++++++++++++++++-
 .../isa/x86/simd-bitwise-legalize.clif      | 16 +++++
 .../filetests/isa/x86/simd-bitwise-run.clif | 13 ++++
 4 files changed, 98 insertions(+), 12 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index e3027a6814..8b732dbca3 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -357,7 +357,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let x86_pshufd = x86_instructions.by_name("x86_pshufd");
     let x86_psll = x86_instructions.by_name("x86_psll");
     let x86_psra = x86_instructions.by_name("x86_psra");
-    let x86_psrl = x86_instructions.by_name("x86_psrl");
     let x86_ptest = x86_instructions.by_name("x86_ptest");
 
     let imm = &shared.imm;
@@ -496,16 +495,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }
 
-    // SIMD shift right (logical)
-    for ty in &[I16, I32, I64] {
-        let ushr = ushr.bind(vector(*ty, sse_vector_size));
-        let bitcast = bitcast.bind(vector(I64, sse_vector_size));
-        narrow.legalize(
-            def!(a = ushr(x, y)),
-            vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
-        );
-    }
-
     // SIMD shift right (arithmetic)
     for ty in &[I16, I32, I64] {
         let sshr = sshr.bind(vector(*ty, sse_vector_size));
@@ -695,6 +684,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     narrow.custom_legalize(extractlane, "convert_extractlane");
     narrow.custom_legalize(insertlane, "convert_insertlane");
     narrow.custom_legalize(ineg, "convert_ineg");
+    narrow.custom_legalize(ushr, "convert_ushr");
 
     narrow.build_and_add_to(&mut shared.transform_groups);
 }
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index c00182f7cd..b77b27d0ca 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -6,7 +6,7 @@ use crate::cursor::{Cursor, FuncCursor};
 use crate::flowgraph::ControlFlowGraph;
 use crate::ir::condcodes::{FloatCC, IntCC};
 use crate::ir::types::*;
-use crate::ir::{self, Function, Inst, InstBuilder};
+use crate::ir::{self, Function, Inst, InstBuilder, MemFlags};
 use crate::isa::constraints::*;
 use crate::isa::enc_tables::*;
 use crate::isa::encoding::base_size;
@@ -1318,6 +1318,73 @@ fn convert_ineg(
     }
 }
 
+// Unsigned shift masks for the i8x16 `ushr` conversion below.
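+// Each 16-byte row corresponds to one shift amount `n` in 0-7 and repeats the byte `0xff >> n`;
+// the i8x16 lowering below selects a row with `ishl_imm(arg1, 4)` and applies it with `band` to
+// clear the bits that the wider i16x8 shift pulls in from the neighboring byte.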
+static USHR_MASKS: [u8; 128] = [
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
+    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+];
+
+// Convert a vector unsigned right shift. x86 has implementations for i16x8 and up (see
+// `x86_psrl`), but for i8x16 we translate the shift to an i16x8 shift and mask off the upper
+// bits. This same conversion could be expressed in the CDSL if it supported varargs (TODO);
+// i.e. `load_complex` has a varargs field that we cannot construct from the CDSL in legalize.rs.
+fn convert_ushr(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::Binary {
+        opcode: ir::Opcode::Ushr,
+        args: [arg0, arg1],
+    } = pos.func.dfg[inst]
+    {
+        // Note that for Wasm, the bounding of the shift index has already happened during translation.
+        let arg0_type = pos.func.dfg.value_type(arg0);
+        let arg1_type = pos.func.dfg.value_type(arg1);
+        assert!(!arg1_type.is_vector() && arg1_type.is_int());
+
+        // TODO: it may be clearer to use scalar_to_vector here; the current issue is that
+        // scalar_to_vector has the restriction that the produced vector must have a matching lane
+        // size (e.g. i32 -> i32x4) whereas bitcast allows any-to-any conversions (e.g. i32 ->
+        // i64x2). This matters because for some reason x86_psrl only allows i64x2 as the shift
+        // index type; this could be relaxed since it is not really meaningful.
+        let shift_index = pos.ins().bitcast(I64X2, arg1);
+
+        if arg0_type == I8X16 {
+            // First, shift the vector using an I16X8 shift.
+            let bitcasted = pos.ins().raw_bitcast(I16X8, arg0);
+            let shifted = pos.ins().x86_psrl(bitcasted, shift_index);
+            let shifted = pos.ins().raw_bitcast(I8X16, shifted);
+
+            // Then, fix up the even lanes that have incorrect upper bits. This uses the 128 mask
+            // bytes as a table that we index into. It is a substantial code-size increase but
+            // reduces the instruction count slightly.
+            let masks = pos.func.dfg.constants.insert(USHR_MASKS.as_ref().into());
+            let mask_address = pos.ins().const_addr(isa.pointer_type(), masks);
+            let mask_offset = pos.ins().ishl_imm(arg1, 4);
+            let mask =
+                pos.ins()
+                    .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0);
+            pos.func.dfg.replace(inst).band(shifted, mask);
+        } else if arg0_type.is_vector() {
+            // x86 has encodings for these shifts.
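+            // These map to PSRLW/PSRLD/PSRLQ for i16x8/i32x4/i64x2, respectively, so the `ushr`
+            // can simply be replaced with `x86_psrl`.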
+            pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
+        } else {
+            unreachable!()
+        }
+    }
+}
+
 fn expand_tls_value(
     inst: ir::Inst,
     func: &mut ir::Function,
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
index af7036b27a..a4e3ec4f0d 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -2,6 +2,22 @@ test legalizer
 set enable_simd
 target x86_64 skylake
 
+function %ushr_i8x16() -> i8x16 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ushr v1, v0
+    ; check: v3 = bitcast.i64x2 v0
+    ; nextln: v4 = raw_bitcast.i16x8 v1
+    ; nextln: v5 = x86_psrl v4, v3
+    ; nextln: v6 = raw_bitcast.i8x16 v5
+    ; nextln: v7 = const_addr.i64 const1
+    ; nextln: v8 = ishl_imm v0, 4
+    ; nextln: v9 = load_complex.i8x16 v7+v8
+    ; nextln: v2 = band v6, v9
+    return v2
+}
+
 function %ishl_i32x4() -> i32x4 {
 block0:
     v0 = iconst.i32 1
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
index 670c501c9b..306a0aba6e 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
@@ -38,6 +38,19 @@ block0:
 }
 ; run
 
+function %ushr_i8x16() -> b1 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ushr v1, v0
+
+    v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
+    v4 = icmp eq v2, v3
+    v5 = vall_true v4
+    return v5
+}
+; run
+
 function %ushr_i64x2() -> b1 {
 block0:
     v0 = iconst.i32 1