Legalize 64 bit shifts on x86_32 using PSLLQ/PSRLQ.
Co-authored-by: iximeow <git@iximeow.net>
This commit is contained in:
@@ -1493,8 +1493,13 @@ fn define_alu(
|
|||||||
for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
|
for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
|
||||||
// Cannot use enc_i32_i64 for this pattern because instructions require
|
// Cannot use enc_i32_i64 for this pattern because instructions require
|
||||||
// to bind any.
|
// to bind any.
|
||||||
|
e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr));
|
||||||
e.enc32(
|
e.enc32(
|
||||||
inst.bind(I32).bind(Any),
|
inst.bind(I32).bind(I16),
|
||||||
|
rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
|
||||||
|
);
|
||||||
|
e.enc32(
|
||||||
|
inst.bind(I32).bind(I32),
|
||||||
rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
|
rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
|
||||||
);
|
);
|
||||||
e.enc64(
|
e.enc64(
|
||||||
|
|||||||
@@ -37,6 +37,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
let imul = insts.by_name("imul");
|
let imul = insts.by_name("imul");
|
||||||
let ineg = insts.by_name("ineg");
|
let ineg = insts.by_name("ineg");
|
||||||
let isub = insts.by_name("isub");
|
let isub = insts.by_name("isub");
|
||||||
|
let ishl = insts.by_name("ishl");
|
||||||
|
let ireduce = insts.by_name("ireduce");
|
||||||
let popcnt = insts.by_name("popcnt");
|
let popcnt = insts.by_name("popcnt");
|
||||||
let sdiv = insts.by_name("sdiv");
|
let sdiv = insts.by_name("sdiv");
|
||||||
let selectif = insts.by_name("selectif");
|
let selectif = insts.by_name("selectif");
|
||||||
@@ -45,6 +47,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
let tls_value = insts.by_name("tls_value");
|
let tls_value = insts.by_name("tls_value");
|
||||||
let udiv = insts.by_name("udiv");
|
let udiv = insts.by_name("udiv");
|
||||||
let umulhi = insts.by_name("umulhi");
|
let umulhi = insts.by_name("umulhi");
|
||||||
|
let ushr = insts.by_name("ushr");
|
||||||
let ushr_imm = insts.by_name("ushr_imm");
|
let ushr_imm = insts.by_name("ushr_imm");
|
||||||
let urem = insts.by_name("urem");
|
let urem = insts.by_name("urem");
|
||||||
|
|
||||||
@@ -55,6 +58,32 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
|
|
||||||
let imm = &shared.imm;
|
let imm = &shared.imm;
|
||||||
|
|
||||||
|
// Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
|
||||||
|
// the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
|
||||||
|
// not encodable.
|
||||||
|
let a = var("a");
|
||||||
|
let x = var("x");
|
||||||
|
let y = var("y");
|
||||||
|
let z = var("z");
|
||||||
|
|
||||||
|
for &ty in &[I8, I16, I32] {
|
||||||
|
let ishl_by_i64 = ishl.bind(ty).bind(I64);
|
||||||
|
let ireduce = ireduce.bind(I32);
|
||||||
|
group.legalize(
|
||||||
|
def!(a = ishl_by_i64(x, y)),
|
||||||
|
vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Logical right shifts get the same treatment as `ishl`: an I64 shift amount is first
// truncated to I32 with `ireduce`, and the shift is then re-emitted with the narrowed amount.
for &ty in &[I8, I16, I32] {
    let ushr_by_i64 = ushr.bind(ty).bind(I64);
    let ireduce = ireduce.bind(I32);
    group.legalize(
        def!(a = ushr_by_i64(x, y)),
        // The replacement must remain a `ushr`; emitting `ishl` here would silently turn
        // `x >> y` into `x << y`.
        vec![def!(z = ireduce(y)), def!(a = ushr(x, z))],
    );
}
|
||||||
|
|
||||||
// Division and remainder.
|
// Division and remainder.
|
||||||
//
|
//
|
||||||
// The srem expansion requires custom code because srem INT_MIN, -1 is not
|
// The srem expansion requires custom code because srem INT_MIN, -1 is not
|
||||||
|
|||||||
@@ -1318,6 +1318,39 @@ fn convert_ineg(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn expand_dword_to_xmm<'f>(
|
||||||
|
pos: &mut FuncCursor<'_>,
|
||||||
|
arg: ir::Value,
|
||||||
|
arg_type: ir::Type,
|
||||||
|
) -> ir::Value {
|
||||||
|
if arg_type == I64 {
|
||||||
|
let (arg_lo, arg_hi) = pos.ins().isplit(arg);
|
||||||
|
let arg = pos.ins().scalar_to_vector(I32X4, arg_lo);
|
||||||
|
let arg = pos.ins().insertlane(arg, 1, arg_hi);
|
||||||
|
let arg = pos.ins().raw_bitcast(I64X2, arg);
|
||||||
|
arg
|
||||||
|
} else {
|
||||||
|
pos.ins().bitcast(I64X2, arg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn contract_dword_from_xmm<'f>(
|
||||||
|
pos: &mut FuncCursor<'f>,
|
||||||
|
inst: ir::Inst,
|
||||||
|
ret: ir::Value,
|
||||||
|
ret_type: ir::Type,
|
||||||
|
) {
|
||||||
|
if ret_type == I64 {
|
||||||
|
let ret = pos.ins().raw_bitcast(I32X4, ret);
|
||||||
|
let ret_lo = pos.ins().extractlane(ret, 0);
|
||||||
|
let ret_hi = pos.ins().extractlane(ret, 1);
|
||||||
|
pos.func.dfg.replace(inst).iconcat(ret_lo, ret_hi);
|
||||||
|
} else {
|
||||||
|
let ret = pos.ins().extractlane(ret, 0);
|
||||||
|
pos.func.dfg.replace(inst).ireduce(ret_type, ret);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Masks for i8x16 unsigned right shift.
|
// Masks for i8x16 unsigned right shift.
|
||||||
static USHR_MASKS: [u8; 128] = [
|
static USHR_MASKS: [u8; 128] = [
|
||||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||||
@@ -1379,7 +1412,14 @@ fn convert_ushr(
|
|||||||
} else if arg0_type.is_vector() {
|
} else if arg0_type.is_vector() {
|
||||||
// x86 has encodings for these shifts.
|
// x86 has encodings for these shifts.
|
||||||
pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
|
pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
|
||||||
|
} else if arg0_type == I64 {
|
||||||
|
// 64 bit shifts need to be legalized on x86_32.
|
||||||
|
let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
|
||||||
|
let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
|
||||||
|
let shifted = pos.ins().x86_psrl(value, amount);
|
||||||
|
contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
|
||||||
} else {
|
} else {
|
||||||
|
// Everything else should be already legal.
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1446,7 +1486,14 @@ fn convert_ishl(
|
|||||||
} else if arg0_type.is_vector() {
|
} else if arg0_type.is_vector() {
|
||||||
// x86 has encodings for these shifts.
|
// x86 has encodings for these shifts.
|
||||||
pos.func.dfg.replace(inst).x86_psll(arg0, shift_index);
|
pos.func.dfg.replace(inst).x86_psll(arg0, shift_index);
|
||||||
|
} else if arg0_type == I64 {
|
||||||
|
// 64 bit shifts need to be legalized on x86_32.
|
||||||
|
let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
|
||||||
|
let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
|
||||||
|
let shifted = pos.ins().x86_psll(value, amount);
|
||||||
|
contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
|
||||||
} else {
|
} else {
|
||||||
|
// Everything else should be already legal.
|
||||||
unreachable!()
|
unreachable!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,51 @@
|
|||||||
|
test compile
|
||||||
|
set enable_simd
|
||||||
|
target i686 haswell
|
||||||
|
|
||||||
|
function u0:1(i32) -> i64 system_v {
|
||||||
|
block1(v0: i32):
|
||||||
|
v1 = load.i64 notrap aligned v0+0
|
||||||
|
v2 = load.i32 notrap aligned v0+16
|
||||||
|
v3 = ishl v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
function u0:2(i32) -> i64 system_v {
|
||||||
|
block1(v0: i32):
|
||||||
|
v1 = load.i64 notrap aligned v0+0
|
||||||
|
v2 = load.i64 notrap aligned v0+16
|
||||||
|
v3 = ishl v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
function u0:3(i32) -> i32 system_v {
|
||||||
|
block1(v0: i32):
|
||||||
|
v1 = load.i32 notrap aligned v0+0
|
||||||
|
v2 = load.i64 notrap aligned v0+16
|
||||||
|
v3 = ishl v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
function u0:4(i32) -> i64 system_v {
|
||||||
|
block1(v0: i32):
|
||||||
|
v1 = load.i64 notrap aligned v0+0
|
||||||
|
v2 = load.i32 notrap aligned v0+16
|
||||||
|
v3 = ushr v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
function u0:5(i32) -> i64 system_v {
|
||||||
|
block1(v0: i32):
|
||||||
|
v1 = load.i64 notrap aligned v0+0
|
||||||
|
v2 = load.i64 notrap aligned v0+16
|
||||||
|
v3 = ushr v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
|
||||||
|
function u0:6(i32) -> i32 system_v {
|
||||||
|
block1(v0: i32):
|
||||||
|
v1 = load.i32 notrap aligned v0+0
|
||||||
|
v2 = load.i64 notrap aligned v0+16
|
||||||
|
v3 = ushr v1, v2
|
||||||
|
return v3
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user