Add x86 implementation of 8x16 ishl
This involves some large mask tables that may hurt code size but reduce the number of instructions. See https://github.com/WebAssembly/simd/issues/117 for a more in-depth discussion on this.
This commit is contained in:
@@ -355,7 +355,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
let x86_pminu = x86_instructions.by_name("x86_pminu");
|
let x86_pminu = x86_instructions.by_name("x86_pminu");
|
||||||
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
|
let x86_pshufb = x86_instructions.by_name("x86_pshufb");
|
||||||
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
|
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
|
||||||
let x86_psll = x86_instructions.by_name("x86_psll");
|
|
||||||
let x86_psra = x86_instructions.by_name("x86_psra");
|
let x86_psra = x86_instructions.by_name("x86_psra");
|
||||||
let x86_ptest = x86_instructions.by_name("x86_ptest");
|
let x86_ptest = x86_instructions.by_name("x86_ptest");
|
||||||
|
|
||||||
@@ -485,16 +484,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// SIMD shift left (logical)
|
|
||||||
for ty in &[I16, I32, I64] {
|
|
||||||
let ishl = ishl.bind(vector(*ty, sse_vector_size));
|
|
||||||
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
|
|
||||||
narrow.legalize(
|
|
||||||
def!(a = ishl(x, y)),
|
|
||||||
vec![def!(b = bitcast(y)), def!(a = x86_psll(x, b))],
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// SIMD shift left (arithmetic)
|
// SIMD shift left (arithmetic)
|
||||||
for ty in &[I16, I32, I64] {
|
for ty in &[I16, I32, I64] {
|
||||||
let sshr = sshr.bind(vector(*ty, sse_vector_size));
|
let sshr = sshr.bind(vector(*ty, sse_vector_size));
|
||||||
@@ -685,6 +674,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
|
|||||||
narrow.custom_legalize(insertlane, "convert_insertlane");
|
narrow.custom_legalize(insertlane, "convert_insertlane");
|
||||||
narrow.custom_legalize(ineg, "convert_ineg");
|
narrow.custom_legalize(ineg, "convert_ineg");
|
||||||
narrow.custom_legalize(ushr, "convert_ushr");
|
narrow.custom_legalize(ushr, "convert_ushr");
|
||||||
|
narrow.custom_legalize(ishl, "convert_ishl");
|
||||||
|
|
||||||
narrow.build_and_add_to(&mut shared.transform_groups);
|
narrow.build_and_add_to(&mut shared.transform_groups);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1318,7 +1318,7 @@ fn convert_ineg(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unsigned shift masks for i8x16 shift.
|
// Masks for i8x16 unsigned right shift.
|
||||||
static USHR_MASKS: [u8; 128] = [
|
static USHR_MASKS: [u8; 128] = [
|
||||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||||
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
|
||||||
@@ -1385,6 +1385,73 @@ fn convert_ushr(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Masks for i8x16 left shift.
//
// Row `i` (16 identical bytes) clears the low `i` bits of every lane; it is
// applied after performing the shift with an i16x8 `psll`, fixing up the bits
// that bled in from the neighboring byte lane.
static SHL_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];
|
||||||
|
|
||||||
|
// Convert a vector left shift. x86 has implementations for i16x8 and up (see `x86_psll`),
|
||||||
|
// but for i8x16 we translate the shift to a i16x8 shift and mask off the lower bits. This same
|
||||||
|
// conversion could be provided in the CDSL if we could use varargs there (TODO); i.e. `load_complex`
|
||||||
|
// has a varargs field that we can't modify with the CDSL in legalize.rs.
|
||||||
|
fn convert_ishl(
|
||||||
|
inst: ir::Inst,
|
||||||
|
func: &mut ir::Function,
|
||||||
|
_cfg: &mut ControlFlowGraph,
|
||||||
|
isa: &dyn TargetIsa,
|
||||||
|
) {
|
||||||
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||||
|
pos.use_srcloc(inst);
|
||||||
|
|
||||||
|
if let ir::InstructionData::Binary {
|
||||||
|
opcode: ir::Opcode::Ishl,
|
||||||
|
args: [arg0, arg1],
|
||||||
|
} = pos.func.dfg[inst]
|
||||||
|
{
|
||||||
|
// Note that for Wasm, the bounding of the shift index has happened during translation
|
||||||
|
let arg0_type = pos.func.dfg.value_type(arg0);
|
||||||
|
let arg1_type = pos.func.dfg.value_type(arg1);
|
||||||
|
assert!(!arg1_type.is_vector() && arg1_type.is_int());
|
||||||
|
|
||||||
|
// TODO it may be more clear to use scalar_to_vector here; the current issue is that
|
||||||
|
// scalar_to_vector has the restriction that the vector produced has a matching lane size
|
||||||
|
// (e.g. i32 -> i32x4) whereas bitcast allows moving any-to-any conversions (e.g. i32 ->
|
||||||
|
// i64x2). This matters because for some reason x86_psrl only allows i64x2 as the shift
|
||||||
|
// index type--this could be relaxed since it is not really meaningful.
|
||||||
|
let shift_index = pos.ins().bitcast(I64X2, arg1);
|
||||||
|
|
||||||
|
if arg0_type == I8X16 {
|
||||||
|
// First, shift the vector using an I16X8 shift.
|
||||||
|
let bitcasted = pos.ins().raw_bitcast(I16X8, arg0);
|
||||||
|
let shifted = pos.ins().x86_psll(bitcasted, shift_index);
|
||||||
|
let shifted = pos.ins().raw_bitcast(I8X16, shifted);
|
||||||
|
|
||||||
|
// Then, fixup the even lanes that have incorrect lower bits. This uses the 128 mask
|
||||||
|
// bytes as a table that we index into. It is a substantial code-size increase but
|
||||||
|
// reduces the instruction count slightly.
|
||||||
|
let masks = pos.func.dfg.constants.insert(SHL_MASKS.as_ref().into());
|
||||||
|
let mask_address = pos.ins().const_addr(isa.pointer_type(), masks);
|
||||||
|
let mask_offset = pos.ins().ishl_imm(arg1, 4);
|
||||||
|
let mask =
|
||||||
|
pos.ins()
|
||||||
|
.load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0);
|
||||||
|
pos.func.dfg.replace(inst).band(shifted, mask);
|
||||||
|
} else if arg0_type.is_vector() {
|
||||||
|
// x86 has encodings for these shifts.
|
||||||
|
pos.func.dfg.replace(inst).x86_psll(arg0, shift_index);
|
||||||
|
} else {
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn expand_tls_value(
|
fn expand_tls_value(
|
||||||
inst: ir::Inst,
|
inst: ir::Inst,
|
||||||
func: &mut ir::Function,
|
func: &mut ir::Function,
|
||||||
|
|||||||
@@ -18,6 +18,22 @@ block0:
|
|||||||
return v2
|
return v2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function %ishl_i8x16() -> i8x16 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = ishl v1, v0
    ; check: v3 = bitcast.i64x2 v0
    ; nextln: v4 = raw_bitcast.i16x8 v1
    ; nextln: v5 = x86_psll v4, v3
    ; nextln: v6 = raw_bitcast.i8x16 v5
    ; nextln: v7 = const_addr.i64 const1
    ; nextln: v8 = ishl_imm v0, 4
    ; nextln: v9 = load_complex.i8x16 v7+v8
    ; nextln: v2 = band v6, v9
    return v2
}
|
||||||
|
|
||||||
function %ishl_i32x4() -> i32x4 {
|
function %ishl_i32x4() -> i32x4 {
|
||||||
block0:
|
block0:
|
||||||
v0 = iconst.i32 1
|
v0 = iconst.i32 1
|
||||||
|
|||||||
@@ -51,6 +51,19 @@ block0:
|
|||||||
}
|
}
|
||||||
; run
|
; run
|
||||||
|
|
||||||
|
function %ishl_i8x16() -> b1 {
block0:
    v0 = iconst.i32 1
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = ishl v1, v0

    v3 = vconst.i8x16 [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30]
    v4 = icmp eq v2, v3
    v5 = vall_true v4
    return v5
}
; run
|
||||||
|
|
||||||
function %ushr_i64x2() -> b1 {
|
function %ushr_i64x2() -> b1 {
|
||||||
block0:
|
block0:
|
||||||
v0 = iconst.i32 1
|
v0 = iconst.i32 1
|
||||||
|
|||||||
Reference in New Issue
Block a user