Add x86 legalization for imul.i64x2 for non-AVX512 CPUs
The `convert_i64x2_imul` custom legalization checks the ISA flags for AVX512DQ or AVX512VL support and legalizes `imul.i64x2` to an `x86_pmullq` in this case; if not, it uses a lengthy SSE2-compatible instruction sequence.
This commit is contained in:
@@ -28,6 +28,17 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
.isa("x86")
|
.isa("x86")
|
||||||
.chain_with(shared.transform_groups.by_name("narrow_flags").id);
|
.chain_with(shared.transform_groups.by_name("narrow_flags").id);
|
||||||
|
|
||||||
|
let mut narrow_avx = TransformGroupBuilder::new(
|
||||||
|
"x86_narrow_avx",
|
||||||
|
r#"
|
||||||
|
Legalize instructions by narrowing with CPU feature checks.
|
||||||
|
|
||||||
|
This special case converts using x86 AVX instructions where available."#,
|
||||||
|
)
|
||||||
|
.isa("x86");
|
||||||
|
// We cannot chain with the x86_narrow group until this group is built, see bottom of this
|
||||||
|
// function for where this is chained.
|
||||||
|
|
||||||
let mut widen = TransformGroupBuilder::new(
|
let mut widen = TransformGroupBuilder::new(
|
||||||
"x86_widen",
|
"x86_widen",
|
||||||
r#"
|
r#"
|
||||||
@@ -343,10 +354,13 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
widen.custom_legalize(ineg, "convert_ineg");
|
widen.custom_legalize(ineg, "convert_ineg");
|
||||||
|
|
||||||
// To reduce compilation times, separate out large blocks of legalizations by theme.
|
// To reduce compilation times, separate out large blocks of legalizations by theme.
|
||||||
define_simd(shared, x86_instructions, &mut narrow, &mut expand);
|
define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx);
|
||||||
|
|
||||||
expand.build_and_add_to(&mut shared.transform_groups);
|
expand.build_and_add_to(&mut shared.transform_groups);
|
||||||
narrow.build_and_add_to(&mut shared.transform_groups);
|
let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups);
|
||||||
|
narrow_avx
|
||||||
|
.chain_with(narrow_id)
|
||||||
|
.build_and_add_to(&mut shared.transform_groups);
|
||||||
widen.build_and_add_to(&mut shared.transform_groups);
|
widen.build_and_add_to(&mut shared.transform_groups);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -354,7 +368,7 @@ fn define_simd(
|
|||||||
shared: &mut SharedDefinitions,
|
shared: &mut SharedDefinitions,
|
||||||
x86_instructions: &InstructionGroup,
|
x86_instructions: &InstructionGroup,
|
||||||
narrow: &mut TransformGroupBuilder,
|
narrow: &mut TransformGroupBuilder,
|
||||||
expand: &mut TransformGroupBuilder,
|
narrow_avx: &mut TransformGroupBuilder,
|
||||||
) {
|
) {
|
||||||
let insts = &shared.instructions;
|
let insts = &shared.instructions;
|
||||||
let band = insts.by_name("band");
|
let band = insts.by_name("band");
|
||||||
@@ -767,12 +781,6 @@ fn define_simd(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// SIMD imul
|
|
||||||
{
|
|
||||||
let imul = imul.bind(vector(I64, sse_vector_size));
|
|
||||||
narrow.legalize(def!(c = imul(a, b)), vec![def!(c = x86_pmullq(a, b))]);
|
|
||||||
}
|
|
||||||
|
|
||||||
narrow.custom_legalize(shuffle, "convert_shuffle");
|
narrow.custom_legalize(shuffle, "convert_shuffle");
|
||||||
narrow.custom_legalize(extractlane, "convert_extractlane");
|
narrow.custom_legalize(extractlane, "convert_extractlane");
|
||||||
narrow.custom_legalize(insertlane, "convert_insertlane");
|
narrow.custom_legalize(insertlane, "convert_insertlane");
|
||||||
@@ -780,5 +788,6 @@ fn define_simd(
|
|||||||
narrow.custom_legalize(ushr, "convert_ushr");
|
narrow.custom_legalize(ushr, "convert_ushr");
|
||||||
narrow.custom_legalize(ishl, "convert_ishl");
|
narrow.custom_legalize(ishl, "convert_ishl");
|
||||||
|
|
||||||
narrow.build_and_add_to(&mut shared.transform_groups);
|
// This lives in its own group, `x86_narrow_avx`, to avoid conflicting with, e.g., i128 legalizations.
|
||||||
|
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use crate::cdsl::cpu_modes::CpuMode;
|
use crate::cdsl::cpu_modes::CpuMode;
|
||||||
use crate::cdsl::isa::TargetIsa;
|
use crate::cdsl::isa::TargetIsa;
|
||||||
use crate::cdsl::types::ReferenceType;
|
use crate::cdsl::types::{ReferenceType, VectorType};
|
||||||
|
|
||||||
use crate::shared::types::Bool::B1;
|
use crate::shared::types::Bool::B1;
|
||||||
use crate::shared::types::Float::{F32, F64};
|
use crate::shared::types::Float::{F32, F64};
|
||||||
@@ -35,6 +35,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
|
|||||||
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
|
let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
|
||||||
let x86_widen = shared_defs.transform_groups.by_name("x86_widen");
|
let x86_widen = shared_defs.transform_groups.by_name("x86_widen");
|
||||||
let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
|
let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
|
||||||
|
let x86_narrow_avx = shared_defs.transform_groups.by_name("x86_narrow_avx");
|
||||||
let x86_expand = shared_defs.transform_groups.by_name("x86_expand");
|
let x86_expand = shared_defs.transform_groups.by_name("x86_expand");
|
||||||
|
|
||||||
x86_32.legalize_monomorphic(expand_flags);
|
x86_32.legalize_monomorphic(expand_flags);
|
||||||
@@ -46,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
|
|||||||
x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
|
x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
|
||||||
x86_32.legalize_type(F32, x86_expand);
|
x86_32.legalize_type(F32, x86_expand);
|
||||||
x86_32.legalize_type(F64, x86_expand);
|
x86_32.legalize_type(F64, x86_expand);
|
||||||
|
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
||||||
|
|
||||||
x86_64.legalize_monomorphic(expand_flags);
|
x86_64.legalize_monomorphic(expand_flags);
|
||||||
x86_64.legalize_default(x86_narrow);
|
x86_64.legalize_default(x86_narrow);
|
||||||
@@ -57,6 +59,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
|
|||||||
x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
|
x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
|
||||||
x86_64.legalize_type(F32, x86_expand);
|
x86_64.legalize_type(F32, x86_expand);
|
||||||
x86_64.legalize_type(F64, x86_expand);
|
x86_64.legalize_type(F64, x86_expand);
|
||||||
|
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
|
||||||
|
|
||||||
let recipes = recipes::define(shared_defs, &settings, &regs);
|
let recipes = recipes::define(shared_defs, &settings, &regs);
|
||||||
|
|
||||||
|
|||||||
@@ -1513,6 +1513,53 @@ fn convert_ishl(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert an imul.i64x2 to a valid code sequence on x86, first with AVX512 and then with SSE2.
///
/// When the x86 ISA flags report AVX512DQ or AVX512VL SIMD support, `inst` is replaced in place
/// by a single `x86_pmullq`. Otherwise it is expanded into shifts, `x86_pmuludq` multiplies and
/// adds inserted before `inst`, and `inst` itself is replaced by the final `iadd`.
///
/// An instruction that is not a binary `imul` is left untouched.
///
/// # Panics
///
/// Panics if `isa` cannot be downcast to the x86 ISA, or if the controlling type of the `imul`
/// is anything other than `I64X2`.
|
||||||
|
fn convert_i64x2_imul(
|
||||||
|
inst: ir::Inst,
|
||||||
|
func: &mut ir::Function,
|
||||||
|
_cfg: &mut ControlFlowGraph,
|
||||||
|
isa: &dyn TargetIsa,
|
||||||
|
) {
|
||||||
|
// Position the cursor at `inst` so the helper instructions are inserted right before it.
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||||
|
// New instructions inherit the source location of the instruction being legalized.
pos.use_srcloc(inst);
|
||||||
|
|
||||||
|
// Only a binary `imul` is handled; there is no `else`, so anything else falls through as-is.
if let ir::InstructionData::Binary {
|
||||||
|
opcode: ir::Opcode::Imul,
|
||||||
|
args: [arg0, arg1],
|
||||||
|
} = pos.func.dfg[inst]
|
||||||
|
{
|
||||||
|
let ty = pos.func.dfg.ctrl_typevar(inst);
|
||||||
|
if ty == I64X2 {
|
||||||
|
// The AVX512 feature flags are read from the concrete x86 ISA, hence the downcast.
let x86_isa = isa
|
||||||
|
.as_any()
|
||||||
|
.downcast_ref::<isa::x86::Isa>()
|
||||||
|
.expect("the target ISA must be x86 at this point");
|
||||||
|
// NOTE(review): Intel documents the 128-bit VPMULLQ form as requiring AVX512VL *and*
// AVX512DQ; confirm that accepting either flag on its own is intended here.
if x86_isa.isa_flags.use_avx512dq_simd() || x86_isa.isa_flags.use_avx512vl_simd() {
|
||||||
|
// If we have certain AVX512 features, we can lower this instruction simply.
|
||||||
|
pos.func.dfg.replace(inst).x86_pmullq(arg0, arg1);
|
||||||
|
} else {
|
||||||
|
// Otherwise, we default to a very lengthy SSE2-compatible sequence. It splits each
|
||||||
|
// 64-bit lane into 32-bit high and low sections using shifting and then performs
|
||||||
|
// the following arithmetic per lane: with arg0 = concat(high0, low0) and arg1 =
|
||||||
|
// concat(high1, low1), calculate (((high0 * low1) + (high1 * low0)) << 32) + (low0 * low1).
// This relies on `x86_pmuludq` multiplying only the low 32 bits of each 64-bit lane.
|
||||||
|
// Move the high half of each arg0 lane into the low 32 bits.
let high0 = pos.ins().ushr_imm(arg0, 32);
|
||||||
|
// high0 * low1
let mul0 = pos.ins().x86_pmuludq(high0, arg1);
|
||||||
|
// Move the high half of each arg1 lane into the low 32 bits.
let high1 = pos.ins().ushr_imm(arg1, 32);
|
||||||
|
// high1 * low0
let mul1 = pos.ins().x86_pmuludq(high1, arg0);
|
||||||
|
let addhigh = pos.ins().iadd(mul0, mul1);
|
||||||
|
// Shift the summed cross products back up into the high half of each lane.
let high = pos.ins().ishl_imm(addhigh, 32);
|
||||||
|
// low0 * low1
let low = pos.ins().x86_pmuludq(arg0, arg1);
|
||||||
|
// Reuse the original instruction (and its result value) for the final addition.
pos.func.dfg.replace(inst).iadd(low, high);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Reached for any controlling type other than I64X2.
unreachable!(
|
||||||
|
"{} should be encodable; it cannot be legalized by convert_i64x2_imul",
|
||||||
|
pos.func.dfg.display_inst(inst, None)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn expand_tls_value(
|
fn expand_tls_value(
|
||||||
inst: ir::Inst,
|
inst: ir::Inst,
|
||||||
func: &mut ir::Function,
|
func: &mut ir::Function,
|
||||||
|
|||||||
@@ -70,9 +70,16 @@ block0:
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
function %imul(i64x2, i64x2) {
|
function %imul_i64x2(i64x2, i64x2) {
|
||||||
block0(v0:i64x2, v1:i64x2):
|
block0(v0:i64x2, v1:i64x2):
|
||||||
v2 = imul v0, v1
|
v2 = imul v0, v1
|
||||||
; check: v2 = x86_pmullq v0, v1
|
; check: v3 = ushr_imm v0, 32
|
||||||
|
; nextln: v4 = x86_pmuludq v3, v1
|
||||||
|
; nextln: v5 = ushr_imm v1, 32
|
||||||
|
; nextln: v6 = x86_pmuludq v5, v0
|
||||||
|
; nextln: v7 = iadd v4, v6
|
||||||
|
; nextln: v8 = ishl_imm v7, 32
|
||||||
|
; nextln: v9 = x86_pmuludq v0, v1
|
||||||
|
; nextln: v2 = iadd v9, v8
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,6 +49,13 @@ block0:
|
|||||||
}
|
}
|
||||||
; run
|
; run
|
||||||
|
|
||||||
|
; Lane-wise 64-bit multiply of two i64x2 vectors; the expected product is asserted after the body.
; NOTE(review): both operands (lanes 0 and 2) fit in 32 bits, so every cross term involving the
; upper 32 bits of a lane is zero here -- consider adding operands with bits set above bit 31.
function %imul_i64x2(i64x2, i64x2) -> i64x2 {
|
||||||
|
block0(v0: i64x2, v1: i64x2):
|
||||||
|
v2 = imul v0, v1
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
; run: %imul_i64x2([0 2], [0 2]) == [0 4]
|
||||||
|
|
||||||
function %imul_i32x4() -> b1 {
|
function %imul_i32x4() -> b1 {
|
||||||
block0:
|
block0:
|
||||||
v0 = vconst.i32x4 [-1 0 1 0x80_00_00_01]
|
v0 = vconst.i32x4 [-1 0 1 0x80_00_00_01]
|
||||||
|
|||||||
@@ -0,0 +1,10 @@
|
|||||||
|
test legalizer
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 skylake has_avx512dq=true
|
||||||
|
|
||||||
|
; With has_avx512dq enabled on the target line, imul.i64x2 is expected to legalize to a single
; x86_pmullq that keeps the original result value v2.
function %imul_i64x2(i64x2, i64x2) {
|
||||||
|
block0(v0:i64x2, v1:i64x2):
|
||||||
|
v2 = imul v0, v1
|
||||||
|
; check: v2 = x86_pmullq v0, v1
|
||||||
|
return
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user