diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 4a04ef574d..30246b85ad 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -356,7 +356,6 @@ pub(crate) fn define( let copy_to_ssa = shared.by_name("copy_to_ssa"); let ctz = shared.by_name("ctz"); let debugtrap = shared.by_name("debugtrap"); - let extractlane = shared.by_name("extractlane"); let f32const = shared.by_name("f32const"); let f64const = shared.by_name("f64const"); let fadd = shared.by_name("fadd"); @@ -460,6 +459,7 @@ pub(crate) fn define( let x86_fmax = x86.by_name("x86_fmax"); let x86_fmin = x86.by_name("x86_fmin"); let x86_pop = x86.by_name("x86_pop"); + let x86_pextr = x86.by_name("x86_pextr"); let x86_pshufd = x86.by_name("x86_pshufd"); let x86_pshufb = x86.by_name("x86_pshufb"); let x86_push = x86.by_name("x86_push"); @@ -1791,16 +1791,16 @@ pub(crate) fn define( } // SIMD extractlane - let mut extractlane_mapping: HashMap, Option)> = + let mut x86_pextr_mapping: HashMap, Option)> = HashMap::new(); - extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB - extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes - extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD - extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64 + x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41))); // PEXTRB + x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2; SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes. NOTE(review): 0F C5 /r appears to take the GPR in ModRM.reg and the XMM in ModRM.r/m, the reverse of the 0F 3A forms - confirm the shared recipe emits operands in that order + x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD + x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], 
Some(use_sse41))); // PEXTRQ, only x86_64 for ty in ValueType::all_lane_types().filter(allowed_simd_type) { - if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) { - let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size); + if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) { + let instruction = x86_pextr.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone()); if ty.lane_bits() < 64 { e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone()); diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 03730cdeac..3f583c6edb 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -291,5 +291,22 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let Idx = &operand_doc("Idx", uimm8, "Lane index"); + let x = &operand("x", TxN); + let a = &operand("a", &TxN.lane_of()); + + ig.push( + Inst::new( + "x86_pextr", + r#" + Extract lane ``Idx`` from ``x``. + The lane index, ``Idx``, is an immediate value, not an SSA value. It + must indicate a valid lane index for the type of ``x``. 
+ "#, + ) + .operands_in(vec![x, Idx]) + .operands_out(vec![a]), + ); + ig.build() } diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index d56beb8022..4c2ebaefd4 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -23,6 +23,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct let bor = insts.by_name("bor"); let clz = insts.by_name("clz"); let ctz = insts.by_name("ctz"); + let extractlane = insts.by_name("extractlane"); let f64const = insts.by_name("f64const"); let fcmp = insts.by_name("fcmp"); let fcvt_from_uint = insts.by_name("fcvt_from_uint"); @@ -379,5 +380,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct ); } + narrow.custom_legalize(extractlane, "convert_extractlane"); + narrow.build_and_add_to(&mut shared.transform_groups); } diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index e0fc05178d..39bdb57845 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -5,6 +5,7 @@ use crate::bitset::BitSet; use crate::cursor::{Cursor, FuncCursor}; use crate::flowgraph::ControlFlowGraph; use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::types::*; use crate::ir::{self, Function, Inst, InstBuilder}; use crate::isa::constraints::*; use crate::isa::enc_tables::*; @@ -893,3 +894,59 @@ fn expand_fcvt_to_uint_sat( cfg.recompute_ebb(pos.func, uint_large_ebb); cfg.recompute_ebb(pos.func, done); } + +/// Legalize CLIF `extractlane`: float lanes already live in XMM registers, so they stay there +/// (shuffled to lane 0 if needed, then `raw_bitcast`); other lane types become `x86_pextr`. +fn convert_extractlane( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::ExtractLane { 
+ opcode: ir::Opcode::Extractlane, + arg, + lane, + } = pos.func.dfg[inst] + { + // NOTE: the following legalization assumes that the upper bits of the XMM register do + // not need to be zeroed during extractlane. + let value_type = pos.func.dfg.value_type(arg); + if value_type.lane_type().is_float() { + // Floats are already in XMM registers and can stay there. + let shuffled = if lane != 0 { + // Replace the extractlane with a PSHUFD to get the float in the right place. + match value_type { + F32X4 => { + // Move the selected lane to lane 0 (mask bits 1:0 select its source). + let shuffle_mask: u8 = 0b00_00_00_00 | lane; + pos.ins().x86_pshufd(arg, shuffle_mask) + } + F64X2 => { + assert_eq!(lane, 1); + // Because we know the lane == 1, we move the upper 64 bits to the lower + // 64 bits, leaving the top 64 bits as-is. + let shuffle_mask = 0b11_10_11_10; + let bitcast = pos.ins().raw_bitcast(F32X4, arg); + pos.ins().x86_pshufd(bitcast, shuffle_mask) + } + _ => unreachable!(), + } + } else { + // Remove the extractlane instruction, leaving the float where it is. + arg + }; + // Then we must bitcast to the right type. + pos.func + .dfg + .replace(inst) + .raw_bitcast(value_type.lane_type(), shuffled); + } else { + // For non-floats, lower with the usual PEXTR* instruction. 
+ pos.func.dfg.replace(inst).x86_pextr(arg, lane); + } + } +} diff --git a/cranelift/filetests/filetests/isa/x86/extractlane-binemit.clif b/cranelift/filetests/filetests/isa/x86/extractlane-binemit.clif new file mode 100644 index 0000000000..0a3b776a99 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/extractlane-binemit.clif @@ -0,0 +1,38 @@ +test binemit +set enable_simd +target x86_64 haswell + +; for extractlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pextr +; which is manually placed in the IR so that it can be binemit-tested + +function %test_extractlane_b8() { +ebb0: +[-, %rax] v0 = bconst.b8 true +[-, %xmm0] v1 = splat.b8x16 v0 +[-, %rax] v2 = x86_pextr v1, 10 ; bin: 66 0f 3a 14 c0 0a + return +} + +function %test_extractlane_i16() { +ebb0: +[-, %rax] v0 = iconst.i16 4 +[-, %xmm1] v1 = splat.i16x8 v0 +[-, %rax] v2 = x86_pextr v1, 4 ; bin: 66 0f c5 c8 04 + return +} + +function %test_extractlane_i32() { +ebb0: +[-, %rax] v0 = iconst.i32 42 +[-, %xmm4] v1 = splat.i32x4 v0 +[-, %rcx] v2 = x86_pextr v1, 2 ; bin: 66 0f 3a 16 e1 02 + return +} + +function %test_extractlane_b64() { +ebb0: +[-, %rax] v0 = bconst.b64 false +[-, %xmm2] v1 = splat.b64x2 v0 +[-, %rbx] v2 = x86_pextr v1, 1 ; bin: 66 48 0f 3a 16 d3 01 + return +} diff --git a/cranelift/filetests/filetests/isa/x86/extractlane-run.clif b/cranelift/filetests/filetests/isa/x86/extractlane-run.clif new file mode 100644 index 0000000000..ce8c00a933 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/extractlane-run.clif @@ -0,0 +1,31 @@ +test run +set enable_simd + +function %test_extractlane_b8() -> b8 { +ebb0: + v1 = vconst.b8x16 [false false false false false false false false false false true false false + false false false] + v2 = extractlane v1, 10 + return v2 +} +; run + +function %test_extractlane_i16() -> b1 { +ebb0: + v0 = vconst.i16x8 0x00080007000600050004000300020001 + v1 = extractlane v0, 1 + v2 = icmp_imm eq v1, 2 + return v2 +} 
+; run + +function %test_extractlane_f32() -> b1 { +ebb0: + v0 = f32const 0x42.42 + v1 = vconst.f32x4 [0x00.00 0x00.00 0x00.00 0x42.42] + v2 = extractlane v1, 3 + v10 = f32const 0x42.42 ; TODO this should not be necessary, v0 should be re-usable + v3 = fcmp eq v2, v10 + return v3 +} +; run diff --git a/cranelift/filetests/filetests/isa/x86/extractlane.clif b/cranelift/filetests/filetests/isa/x86/extractlane.clif deleted file mode 100644 index e7a1ea898e..0000000000 --- a/cranelift/filetests/filetests/isa/x86/extractlane.clif +++ /dev/null @@ -1,35 +0,0 @@ -test binemit -set enable_simd -target x86_64 haswell - -function %test_extractlane_b8() { -ebb0: -[-, %rax] v0 = bconst.b8 true -[-, %xmm0] v1 = splat.b8x16 v0 -[-, %rax] v2 = extractlane v1, 10 ; bin: 66 0f 3a 14 c0 0a - return -} - -function %test_extractlane_i16() { -ebb0: -[-, %rax] v0 = iconst.i16 4 -[-, %xmm1] v1 = splat.i16x8 v0 -[-, %rax] v2 = extractlane v1, 4 ; bin: 66 0f c5 c8 04 - return -} - -function %test_extractlane_i32() { -ebb0: -[-, %rax] v0 = iconst.i32 42 -[-, %xmm4] v1 = splat.i32x4 v0 -[-, %rcx] v2 = extractlane v1, 2 ; bin: 66 0f 3a 16 e1 02 - return -} - -function %test_extractlane_f64() { -ebb0: -[-, %rax] v0 = f64const 0x0.0 -[-, %xmm2] v1 = splat.f64x2 v0 -[-, %rbx] v2 = extractlane v1, 1 ; bin: 66 48 0f 3a 16 d3 01 - return -}