Add x86 implementation of i8x16 ushr

This involves some large mask tables that may hurt code size but reduce the number of instructions. See https://github.com/WebAssembly/simd/issues/117 for a more in-depth discussion on this.
Andrew Brown
2020-03-20 18:59:20 -07:00
parent 39c0a28d77
commit 3f47291f2e
4 changed files with 98 additions and 12 deletions
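
For orientation, here is a scalar model of the lowering described above (illustrative only, not code from this patch; the helper name is invented): shift adjacent byte pairs as one 16-bit lane, then AND every byte with 0xff >> n, which clears the bits that leak across byte boundaries during the wider shift.

// Scalar model of the lowering: shift a pair of byte lanes as one little-endian
// u16 (the raw_bitcast + x86_psrl in the patch), then mask every byte with
// `0xff >> n` (the band with a row of the USHR_MASKS table).
fn ushr_i8x16_via_i16x8(lanes: [u8; 16], shift: u32) -> [u8; 16] {
    let n = shift % 8; // for Wasm, the shift amount is already bounded during translation
    let mask = 0xffu8 >> n; // one byte of mask row `n`
    let mut out = [0u8; 16];
    for pair in 0..8 {
        // View two adjacent byte lanes as a little-endian u16 and shift the whole thing.
        let lo = lanes[2 * pair] as u16;
        let hi = lanes[2 * pair + 1] as u16;
        let shifted = ((hi << 8) | lo) >> n;
        // Bits from `hi` that crossed into the even (low) lane are cleared by the mask.
        out[2 * pair] = (shifted as u8) & mask;
        out[2 * pair + 1] = ((shifted >> 8) as u8) & mask;
    }
    out
}

fn main() {
    let v: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    // Matches the expected vector in the run test at the bottom of this commit.
    let expected: [u8; 16] = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7];
    assert_eq!(ushr_i8x16_via_i16x8(v, 1), expected);
}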


@@ -357,7 +357,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let x86_pshufd = x86_instructions.by_name("x86_pshufd");
     let x86_psll = x86_instructions.by_name("x86_psll");
     let x86_psra = x86_instructions.by_name("x86_psra");
-    let x86_psrl = x86_instructions.by_name("x86_psrl");
     let x86_ptest = x86_instructions.by_name("x86_ptest");
 
     let imm = &shared.imm;
@@ -496,16 +495,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }
 
-    // SIMD shift right (logical)
-    for ty in &[I16, I32, I64] {
-        let ushr = ushr.bind(vector(*ty, sse_vector_size));
-        let bitcast = bitcast.bind(vector(I64, sse_vector_size));
-        narrow.legalize(
-            def!(a = ushr(x, y)),
-            vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
-        );
-    }
-
     // SIMD shift right (arithmetic)
     for ty in &[I16, I32, I64] {
         let sshr = sshr.bind(vector(*ty, sse_vector_size));
@@ -695,6 +684,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     narrow.custom_legalize(extractlane, "convert_extractlane");
     narrow.custom_legalize(insertlane, "convert_insertlane");
     narrow.custom_legalize(ineg, "convert_ineg");
+    narrow.custom_legalize(ushr, "convert_ushr");
 
     narrow.build_and_add_to(&mut shared.transform_groups);
 }


@@ -6,7 +6,7 @@ use crate::cursor::{Cursor, FuncCursor};
 use crate::flowgraph::ControlFlowGraph;
 use crate::ir::condcodes::{FloatCC, IntCC};
 use crate::ir::types::*;
-use crate::ir::{self, Function, Inst, InstBuilder};
+use crate::ir::{self, Function, Inst, InstBuilder, MemFlags};
 use crate::isa::constraints::*;
 use crate::isa::enc_tables::*;
 use crate::isa::encoding::base_size;
@@ -1318,6 +1318,73 @@ fn convert_ineg(
     }
 }
 
+// Unsigned shift masks for i8x16 shift.
+static USHR_MASKS: [u8; 128] = [
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
+    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+];
+
+// Convert a vector unsigned right shift. x86 has implementations for i16x8 and up (see
+// `x86_psrl`), but for i8x16 we translate the shift to an i16x8 shift and mask off the upper
+// bits. This same conversion could be provided in the CDSL if we could use varargs there (TODO);
+// i.e. `load_complex` has a varargs field that we can't modify with the CDSL in legalize.rs.
+fn convert_ushr(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    if let ir::InstructionData::Binary {
+        opcode: ir::Opcode::Ushr,
+        args: [arg0, arg1],
+    } = pos.func.dfg[inst]
+    {
+        // Note that for Wasm, the bounding of the shift index has happened during translation.
+        let arg0_type = pos.func.dfg.value_type(arg0);
+        let arg1_type = pos.func.dfg.value_type(arg1);
+        assert!(!arg1_type.is_vector() && arg1_type.is_int());
+
+        // TODO: it may be clearer to use scalar_to_vector here; the current issue is that
+        // scalar_to_vector restricts the produced vector to a matching lane size (e.g.
+        // i32 -> i32x4) whereas bitcast allows any-to-any conversions (e.g. i32 -> i64x2).
+        // This matters because for some reason x86_psrl only allows i64x2 as the shift index
+        // type--this could be relaxed since it is not really meaningful.
+        let shift_index = pos.ins().bitcast(I64X2, arg1);
+
+        if arg0_type == I8X16 {
+            // First, shift the vector using an I16X8 shift.
+            let bitcasted = pos.ins().raw_bitcast(I16X8, arg0);
+            let shifted = pos.ins().x86_psrl(bitcasted, shift_index);
+            let shifted = pos.ins().raw_bitcast(I8X16, shifted);
+
+            // Then, fix up the even lanes that have incorrect upper bits. This uses the 128 mask
+            // bytes as a table that we index into. It is a substantial code-size increase but
+            // reduces the instruction count slightly.
+            let masks = pos.func.dfg.constants.insert(USHR_MASKS.as_ref().into());
+            let mask_address = pos.ins().const_addr(isa.pointer_type(), masks);
+            let mask_offset = pos.ins().ishl_imm(arg1, 4);
+            let mask =
+                pos.ins()
+                    .load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0);
+            pos.func.dfg.replace(inst).band(shifted, mask);
+        } else if arg0_type.is_vector() {
+            // x86 has encodings for these shifts.
+            pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
+        } else {
+            unreachable!()
+        }
+    }
+}
+
 fn expand_tls_value(
     inst: ir::Inst,
     func: &mut ir::Function,

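A small model of the mask-row lookup used above (again illustrative, not patch code; the function name is invented): `ishl_imm v0, 4` scales the scalar shift amount by 16 because each row of USHR_MASKS is 16 bytes, and `load_complex` then reads the 16-byte row at that offset from the `const_addr` base.

// Model of the lookup performed by `const_addr` + `ishl_imm` + `load_complex`.
fn ushr_mask_row(masks: &[u8; 128], shift_amount: u32) -> [u8; 16] {
    // Row `n` starts at byte offset `n * 16`. Wasm translation already bounds the shift
    // amount; the `% 8` only keeps this standalone model in range.
    let offset = ((shift_amount % 8) as usize) << 4;
    let mut row = [0u8; 16];
    row.copy_from_slice(&masks[offset..offset + 16]);
    row
}

fn main() {
    // Rebuild the table from the patch: row `n` is `0xff >> n` repeated 16 times.
    let masks: [u8; 128] = core::array::from_fn(|i| 0xffu8 >> (i / 16));
    assert_eq!(ushr_mask_row(&masks, 1), [0x7f; 16]);
}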

@@ -2,6 +2,22 @@ test legalizer
 set enable_simd
 target x86_64 skylake
 
+function %ushr_i8x16() -> i8x16 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ushr v1, v0
+    ; check: v3 = bitcast.i64x2 v0
+    ; nextln: v4 = raw_bitcast.i16x8 v1
+    ; nextln: v5 = x86_psrl v4, v3
+    ; nextln: v6 = raw_bitcast.i8x16 v5
+    ; nextln: v7 = const_addr.i64 const1
+    ; nextln: v8 = ishl_imm v0, 4
+    ; nextln: v9 = load_complex.i8x16 v7+v8
+    ; nextln: v2 = band v6, v9
+    return v2
+}
+
 function %ishl_i32x4() -> i32x4 {
 block0:
     v0 = iconst.i32 1


@@ -38,6 +38,19 @@ block0:
 }
 ; run
 
+function %ushr_i8x16() -> b1 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = ushr v1, v0
+    v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
+    v4 = icmp eq v2, v3
+    v5 = vall_true v4
+    return v5
+}
+; run
+
 function %ushr_i64x2() -> b1 {
 block0:
     v0 = iconst.i32 1