Avoid extra register movement when lowering the x86 extractlane of a float vector
This commit is based on the assumption that floats are already stored in XMM registers on x86. When extracting a lane, Cranelift was moving the float to a general-purpose register and then back to an XMM register; this change avoids that round trip by shuffling the float value into the lowest bits of the XMM register. It also assumes that the upper bits of the register can be left as-is (instead of being zeroed out).
This commit is contained in:
@@ -356,7 +356,6 @@ pub(crate) fn define(
|
|||||||
let copy_to_ssa = shared.by_name("copy_to_ssa");
|
let copy_to_ssa = shared.by_name("copy_to_ssa");
|
||||||
let ctz = shared.by_name("ctz");
|
let ctz = shared.by_name("ctz");
|
||||||
let debugtrap = shared.by_name("debugtrap");
|
let debugtrap = shared.by_name("debugtrap");
|
||||||
let extractlane = shared.by_name("extractlane");
|
|
||||||
let f32const = shared.by_name("f32const");
|
let f32const = shared.by_name("f32const");
|
||||||
let f64const = shared.by_name("f64const");
|
let f64const = shared.by_name("f64const");
|
||||||
let fadd = shared.by_name("fadd");
|
let fadd = shared.by_name("fadd");
|
||||||
@@ -460,6 +459,7 @@ pub(crate) fn define(
|
|||||||
let x86_fmax = x86.by_name("x86_fmax");
|
let x86_fmax = x86.by_name("x86_fmax");
|
||||||
let x86_fmin = x86.by_name("x86_fmin");
|
let x86_fmin = x86.by_name("x86_fmin");
|
||||||
let x86_pop = x86.by_name("x86_pop");
|
let x86_pop = x86.by_name("x86_pop");
|
||||||
|
let x86_pextr = x86.by_name("x86_pextr");
|
||||||
let x86_pshufd = x86.by_name("x86_pshufd");
|
let x86_pshufd = x86.by_name("x86_pshufd");
|
||||||
let x86_pshufb = x86.by_name("x86_pshufb");
|
let x86_pshufb = x86.by_name("x86_pshufb");
|
||||||
let x86_push = x86.by_name("x86_push");
|
let x86_push = x86.by_name("x86_push");
|
||||||
@@ -1791,16 +1791,16 @@ pub(crate) fn define(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// SIMD extractlane
|
// SIMD extractlane
|
||||||
let mut extractlane_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
|
let mut x86_pextr_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
|
||||||
HashMap::new();
|
HashMap::new();
|
||||||
extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB
|
x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB
|
||||||
extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
|
x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes
|
||||||
extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD
|
x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD
|
||||||
extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64
|
x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64
|
||||||
|
|
||||||
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
|
for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
|
||||||
if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
|
if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) {
|
||||||
let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
|
let instruction = x86_pextr.bind_vector_from_lane(ty, sse_vector_size);
|
||||||
let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
|
let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
|
||||||
if ty.lane_bits() < 64 {
|
if ty.lane_bits() < 64 {
|
||||||
e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
|
e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone());
|
||||||
|
|||||||
@@ -291,5 +291,22 @@ pub(crate) fn define(
|
|||||||
.operands_out(vec![a]),
|
.operands_out(vec![a]),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let Idx = &operand_doc("Idx", uimm8, "Lane index");
|
||||||
|
let x = &operand("x", TxN);
|
||||||
|
let a = &operand("a", &TxN.lane_of());
|
||||||
|
|
||||||
|
ig.push(
|
||||||
|
Inst::new(
|
||||||
|
"x86_pextr",
|
||||||
|
r#"
|
||||||
|
Extract lane ``Idx`` from ``x``.
|
||||||
|
The lane index, ``Idx``, is an immediate value, not an SSA value. It
|
||||||
|
must indicate a valid lane index for the type of ``x``.
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.operands_in(vec![x, Idx])
|
||||||
|
.operands_out(vec![a]),
|
||||||
|
);
|
||||||
|
|
||||||
ig.build()
|
ig.build()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
let bor = insts.by_name("bor");
|
let bor = insts.by_name("bor");
|
||||||
let clz = insts.by_name("clz");
|
let clz = insts.by_name("clz");
|
||||||
let ctz = insts.by_name("ctz");
|
let ctz = insts.by_name("ctz");
|
||||||
|
let extractlane = insts.by_name("extractlane");
|
||||||
let f64const = insts.by_name("f64const");
|
let f64const = insts.by_name("f64const");
|
||||||
let fcmp = insts.by_name("fcmp");
|
let fcmp = insts.by_name("fcmp");
|
||||||
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
|
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
|
||||||
@@ -379,5 +380,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
narrow.custom_legalize(extractlane, "convert_extractlane");
|
||||||
|
|
||||||
narrow.build_and_add_to(&mut shared.transform_groups);
|
narrow.build_and_add_to(&mut shared.transform_groups);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use crate::bitset::BitSet;
|
|||||||
use crate::cursor::{Cursor, FuncCursor};
|
use crate::cursor::{Cursor, FuncCursor};
|
||||||
use crate::flowgraph::ControlFlowGraph;
|
use crate::flowgraph::ControlFlowGraph;
|
||||||
use crate::ir::condcodes::{FloatCC, IntCC};
|
use crate::ir::condcodes::{FloatCC, IntCC};
|
||||||
|
use crate::ir::types::*;
|
||||||
use crate::ir::{self, Function, Inst, InstBuilder};
|
use crate::ir::{self, Function, Inst, InstBuilder};
|
||||||
use crate::isa::constraints::*;
|
use crate::isa::constraints::*;
|
||||||
use crate::isa::enc_tables::*;
|
use crate::isa::enc_tables::*;
|
||||||
@@ -893,3 +894,59 @@ fn expand_fcvt_to_uint_sat(
|
|||||||
cfg.recompute_ebb(pos.func, uint_large_ebb);
|
cfg.recompute_ebb(pos.func, uint_large_ebb);
|
||||||
cfg.recompute_ebb(pos.func, done);
|
cfg.recompute_ebb(pos.func, done);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Because floats already exist in XMM registers, we can keep them there when executing a CLIF
|
||||||
|
/// extractlane instruction
|
||||||
|
fn convert_extractlane(
|
||||||
|
inst: ir::Inst,
|
||||||
|
func: &mut ir::Function,
|
||||||
|
_cfg: &mut ControlFlowGraph,
|
||||||
|
_isa: &dyn TargetIsa,
|
||||||
|
) {
|
||||||
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||||
|
pos.use_srcloc(inst);
|
||||||
|
|
||||||
|
if let ir::InstructionData::ExtractLane {
|
||||||
|
opcode: ir::Opcode::Extractlane,
|
||||||
|
arg,
|
||||||
|
lane,
|
||||||
|
} = pos.func.dfg[inst]
|
||||||
|
{
|
||||||
|
// NOTE: the following legalization assumes that the upper bits of the XMM register do
|
||||||
|
// not need to be zeroed during extractlane.
|
||||||
|
let value_type = pos.func.dfg.value_type(arg);
|
||||||
|
if value_type.lane_type().is_float() {
|
||||||
|
// Floats are already in XMM registers and can stay there.
|
||||||
|
let shuffled = if lane != 0 {
|
||||||
|
// Replace the extractlane with a PSHUFD to get the float in the right place.
|
||||||
|
match value_type {
|
||||||
|
F32X4 => {
|
||||||
|
// Move the selected lane to the 0 lane.
|
||||||
|
let shuffle_mask: u8 = 0b00_00_00_00 | lane;
|
||||||
|
pos.ins().x86_pshufd(arg, shuffle_mask)
|
||||||
|
}
|
||||||
|
F64X2 => {
|
||||||
|
assert_eq!(lane, 1);
|
||||||
|
// Because we know the lane == 1, we move the upper 64 bits to the lower
|
||||||
|
// 64 bits, leaving the top 64 bits as-is.
|
||||||
|
let shuffle_mask = 0b11_10_11_10;
|
||||||
|
let bitcast = pos.ins().raw_bitcast(F32X4, arg);
|
||||||
|
pos.ins().x86_pshufd(bitcast, shuffle_mask)
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Remove the extractlane instruction, leaving the float where it is.
|
||||||
|
arg
|
||||||
|
};
|
||||||
|
// Then we must bitcast to the right type.
|
||||||
|
pos.func
|
||||||
|
.dfg
|
||||||
|
.replace(inst)
|
||||||
|
.raw_bitcast(value_type.lane_type(), shuffled);
|
||||||
|
} else {
|
||||||
|
// For non-floats, lower with the usual PEXTR* instruction.
|
||||||
|
pos.func.dfg.replace(inst).x86_pextr(arg, lane);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,38 @@
|
|||||||
|
test binemit
|
||||||
|
set enable_simd
|
||||||
|
target x86_64 haswell
|
||||||
|
|
||||||
|
; for extractlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pextr
|
||||||
|
; which is manually placed in the IR so that it can be binemit-tested
|
||||||
|
|
||||||
|
function %test_extractlane_b8() {
|
||||||
|
ebb0:
|
||||||
|
[-, %rax] v0 = bconst.b8 true
|
||||||
|
[-, %xmm0] v1 = splat.b8x16 v0
|
||||||
|
[-, %rax] v2 = x86_pextr v1, 10 ; bin: 66 0f 3a 14 c0 0a
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
function %test_extractlane_i16() {
|
||||||
|
ebb0:
|
||||||
|
[-, %rax] v0 = iconst.i16 4
|
||||||
|
[-, %xmm1] v1 = splat.i16x8 v0
|
||||||
|
[-, %rax] v2 = x86_pextr v1, 4 ; bin: 66 0f c5 c8 04
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
function %test_extractlane_i32() {
|
||||||
|
ebb0:
|
||||||
|
[-, %rax] v0 = iconst.i32 42
|
||||||
|
[-, %xmm4] v1 = splat.i32x4 v0
|
||||||
|
[-, %rcx] v2 = x86_pextr v1, 2 ; bin: 66 0f 3a 16 e1 02
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
function %test_extractlane_b64() {
|
||||||
|
ebb0:
|
||||||
|
[-, %rax] v0 = bconst.b64 false
|
||||||
|
[-, %xmm2] v1 = splat.b64x2 v0
|
||||||
|
[-, %rbx] v2 = x86_pextr v1, 1 ; bin: 66 48 0f 3a 16 d3 01
|
||||||
|
return
|
||||||
|
}
|
||||||
31
cranelift/filetests/filetests/isa/x86/extractlane-run.clif
Normal file
31
cranelift/filetests/filetests/isa/x86/extractlane-run.clif
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
test run
|
||||||
|
set enable_simd
|
||||||
|
|
||||||
|
function %test_extractlane_b8() -> b8 {
|
||||||
|
ebb0:
|
||||||
|
v1 = vconst.b8x16 [false false false false false false false false false false true false false
|
||||||
|
false false false]
|
||||||
|
v2 = extractlane v1, 10
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
|
function %test_extractlane_i16() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = vconst.i16x8 0x00080007000600050004000300020001
|
||||||
|
v1 = extractlane v0, 1
|
||||||
|
v2 = icmp_imm eq v1, 2
|
||||||
|
return v2
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
|
function %test_extractlane_f32() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = f32const 0x42.42
|
||||||
|
v1 = vconst.f32x4 [0x00.00 0x00.00 0x00.00 0x42.42]
|
||||||
|
v2 = extractlane v1, 3
|
||||||
|
v10 = f32const 0x42.42 ; TODO this should not be necessary, v0 should be re-usable
|
||||||
|
v3 = fcmp eq v2, v10
|
||||||
|
return v3
|
||||||
|
}
|
||||||
|
; run
|
||||||
@@ -1,35 +0,0 @@
|
|||||||
test binemit
|
|
||||||
set enable_simd
|
|
||||||
target x86_64 haswell
|
|
||||||
|
|
||||||
function %test_extractlane_b8() {
|
|
||||||
ebb0:
|
|
||||||
[-, %rax] v0 = bconst.b8 true
|
|
||||||
[-, %xmm0] v1 = splat.b8x16 v0
|
|
||||||
[-, %rax] v2 = extractlane v1, 10 ; bin: 66 0f 3a 14 c0 0a
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
function %test_extractlane_i16() {
|
|
||||||
ebb0:
|
|
||||||
[-, %rax] v0 = iconst.i16 4
|
|
||||||
[-, %xmm1] v1 = splat.i16x8 v0
|
|
||||||
[-, %rax] v2 = extractlane v1, 4 ; bin: 66 0f c5 c8 04
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
function %test_extractlane_i32() {
|
|
||||||
ebb0:
|
|
||||||
[-, %rax] v0 = iconst.i32 42
|
|
||||||
[-, %xmm4] v1 = splat.i32x4 v0
|
|
||||||
[-, %rcx] v2 = extractlane v1, 2 ; bin: 66 0f 3a 16 e1 02
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
function %test_extractlane_f64() {
|
|
||||||
ebb0:
|
|
||||||
[-, %rax] v0 = f64const 0x0.0
|
|
||||||
[-, %xmm2] v1 = splat.f64x2 v0
|
|
||||||
[-, %rbx] v2 = extractlane v1, 1 ; bin: 66 48 0f 3a 16 d3 01
|
|
||||||
return
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user