Infer REX prefixes for SIMD load_extend
This commit is contained in:
@@ -1945,15 +1945,8 @@ fn define_simd(
|
|||||||
for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] {
|
for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] {
|
||||||
let inst = *inst;
|
let inst = *inst;
|
||||||
let template = recipe.opcodes(*opcodes);
|
let template = recipe.opcodes(*opcodes);
|
||||||
e.enc32_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
|
e.enc_both_inferred_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
|
||||||
// REX-less encoding must come after REX encoding so we don't use it by
|
e.enc64_maybe_isap(inst.bind(I64), template.infer_rex(), isap);
|
||||||
// default. Otherwise reg-alloc would never use r8 and up.
|
|
||||||
e.enc64_maybe_isap(inst.clone().bind(I32), template.clone().rex(), isap);
|
|
||||||
e.enc64_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
|
|
||||||
// Similar to above; TODO some of this duplication can be cleaned up by infer_rex()
|
|
||||||
// tracked in https://github.com/bytecodealliance/cranelift/issues/1090
|
|
||||||
e.enc64_maybe_isap(inst.clone().bind(I64), template.clone().rex(), isap);
|
|
||||||
e.enc64_maybe_isap(inst.bind(I64), template, isap);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2087,7 +2087,7 @@ pub(crate) fn define<'shared>(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// XX /r float load with 8-bit offset.
|
// XX /r float load with 8-bit offset.
|
||||||
recipes.add_template_recipe(
|
recipes.add_template_inferred(
|
||||||
EncodingRecipeBuilder::new("fldDisp8", &formats.load, 2)
|
EncodingRecipeBuilder::new("fldDisp8", &formats.load, 2)
|
||||||
.operands_in(vec![gpr])
|
.operands_in(vec![gpr])
|
||||||
.operands_out(vec![fpr])
|
.operands_out(vec![fpr])
|
||||||
@@ -2110,6 +2110,7 @@ pub(crate) fn define<'shared>(
|
|||||||
sink.put1(offset as u8);
|
sink.put1(offset as u8);
|
||||||
"#,
|
"#,
|
||||||
),
|
),
|
||||||
|
"size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0",
|
||||||
);
|
);
|
||||||
|
|
||||||
let has_big_offset =
|
let has_big_offset =
|
||||||
@@ -2142,7 +2143,7 @@ pub(crate) fn define<'shared>(
|
|||||||
);
|
);
|
||||||
|
|
||||||
// XX /r float load with 32-bit offset.
|
// XX /r float load with 32-bit offset.
|
||||||
recipes.add_template_recipe(
|
recipes.add_template_inferred(
|
||||||
EncodingRecipeBuilder::new("fldDisp32", &formats.load, 5)
|
EncodingRecipeBuilder::new("fldDisp32", &formats.load, 5)
|
||||||
.operands_in(vec![gpr])
|
.operands_in(vec![gpr])
|
||||||
.operands_out(vec![fpr])
|
.operands_out(vec![fpr])
|
||||||
@@ -2165,6 +2166,7 @@ pub(crate) fn define<'shared>(
|
|||||||
sink.put4(offset as u32);
|
sink.put4(offset as u32);
|
||||||
"#,
|
"#,
|
||||||
),
|
),
|
||||||
|
"size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -156,6 +156,22 @@ fn size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0(
|
|||||||
+ if needs_rex { 1 } else { 0 }
|
+ if needs_rex { 1 } else { 0 }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Calculates the size while inferring whether the first input register (inreg0) or the first
|
||||||
|
/// output register (outreg0) requires a dynamic REX prefix, and whether the first input register
|
||||||
|
/// (inreg0) requires a SIB byte.
|
||||||
|
fn size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0(
|
||||||
|
sizing: &RecipeSizing,
|
||||||
|
enc: Encoding,
|
||||||
|
inst: Inst,
|
||||||
|
divert: &RegDiversions,
|
||||||
|
func: &Function,
|
||||||
|
) -> u8 {
|
||||||
|
let needs_rex = (EncodingBits::from(enc.bits()).rex_w() != 0)
|
||||||
|
|| test_input(0, inst, divert, func, is_extended_reg)
|
||||||
|
|| test_result(0, inst, divert, func, is_extended_reg);
|
||||||
|
size_plus_maybe_sib_for_inreg_0(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 }
|
||||||
|
}
|
||||||
|
|
||||||
/// Infers whether a dynamic REX prefix will be emitted, for use with one input reg.
|
/// Infers whether a dynamic REX prefix will be emitted, for use with one input reg.
|
||||||
///
|
///
|
||||||
/// A REX prefix is known to be emitted if either:
|
/// A REX prefix is known to be emitted if either:
|
||||||
|
|||||||
@@ -40,29 +40,29 @@ block0:
|
|||||||
function %uload_extend() {
|
function %uload_extend() {
|
||||||
block0:
|
block0:
|
||||||
[-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef
|
[-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef
|
||||||
[-,%xmm2] v3 = uload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 30 12
|
[-,%xmm2] v3 = uload8x8 v1+0 ; bin: heap_oob 66 0f 38 30 12
|
||||||
[-,%xmm2] v4 = uload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 30 52 14
|
[-,%xmm2] v4 = uload8x8 v1+20 ; bin: heap_oob 66 0f 38 30 52 14
|
||||||
[-,%xmm2] v5 = uload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 30 92 00000100
|
[-,%xmm2] v5 = uload8x8 v1+256 ; bin: heap_oob 66 0f 38 30 92 00000100
|
||||||
[-,%xmm2] v6 = uload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 33 12
|
[-,%xmm2] v6 = uload16x4 v1+0 ; bin: heap_oob 66 0f 38 33 12
|
||||||
[-,%xmm2] v7 = uload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 33 52 14
|
[-,%xmm2] v7 = uload16x4 v1+20 ; bin: heap_oob 66 0f 38 33 52 14
|
||||||
[-,%xmm2] v8 = uload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 33 92 00000100
|
[-,%xmm2] v8 = uload16x4 v1+256 ; bin: heap_oob 66 0f 38 33 92 00000100
|
||||||
[-,%xmm2] v9 = uload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 35 12
|
[-,%xmm10] v9 = uload32x2 v1+0 ; bin: heap_oob 66 44 0f 38 35 12
|
||||||
[-,%xmm2] v10 = uload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 35 52 14
|
[-,%xmm10] v10 = uload32x2 v1+20 ; bin: heap_oob 66 44 0f 38 35 52 14
|
||||||
[-,%xmm2] v11 = uload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 35 92 00000100
|
[-,%xmm10] v11 = uload32x2 v1+256 ; bin: heap_oob 66 44 0f 38 35 92 00000100
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
function %sload_extend() {
|
function %sload_extend() {
|
||||||
block0:
|
block0:
|
||||||
[-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef
|
[-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef
|
||||||
[-,%xmm2] v3 = sload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 20 12
|
[-,%xmm2] v3 = sload8x8 v1+0 ; bin: heap_oob 66 0f 38 20 12
|
||||||
[-,%xmm2] v4 = sload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 20 52 14
|
[-,%xmm2] v4 = sload8x8 v1+20 ; bin: heap_oob 66 0f 38 20 52 14
|
||||||
[-,%xmm2] v5 = sload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 20 92 00000100
|
[-,%xmm2] v5 = sload8x8 v1+256 ; bin: heap_oob 66 0f 38 20 92 00000100
|
||||||
[-,%xmm2] v6 = sload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 23 12
|
[-,%xmm10] v6 = sload16x4 v1+0 ; bin: heap_oob 66 44 0f 38 23 12
|
||||||
[-,%xmm2] v7 = sload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 23 52 14
|
[-,%xmm10] v7 = sload16x4 v1+20 ; bin: heap_oob 66 44 0f 38 23 52 14
|
||||||
[-,%xmm2] v8 = sload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 23 92 00000100
|
[-,%xmm10] v8 = sload16x4 v1+256 ; bin: heap_oob 66 44 0f 38 23 92 00000100
|
||||||
[-,%xmm2] v9 = sload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 25 12
|
[-,%xmm2] v9 = sload32x2 v1+0 ; bin: heap_oob 66 0f 38 25 12
|
||||||
[-,%xmm2] v10 = sload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 25 52 14
|
[-,%xmm2] v10 = sload32x2 v1+20 ; bin: heap_oob 66 0f 38 25 52 14
|
||||||
[-,%xmm2] v11 = sload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 25 92 00000100
|
[-,%xmm2] v11 = sload32x2 v1+256 ; bin: heap_oob 66 0f 38 25 92 00000100
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user