diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index b95705f9bc..14b3c0eea9 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -396,7 +396,6 @@ pub(crate) fn define( let ifcmp_sp = shared.by_name("ifcmp_sp"); let imul = shared.by_name("imul"); let indirect_jump_table_br = shared.by_name("indirect_jump_table_br"); - let insertlane = shared.by_name("insertlane"); let ireduce = shared.by_name("ireduce"); let ishl = shared.by_name("ishl"); let ishl_imm = shared.by_name("ishl_imm"); @@ -469,8 +468,12 @@ pub(crate) fn define( let x86_cvtt2si = x86.by_name("x86_cvtt2si"); let x86_fmax = x86.by_name("x86_fmax"); let x86_fmin = x86.by_name("x86_fmin"); + let x86_insertps = x86.by_name("x86_insertps"); + let x86_movlhps = x86.by_name("x86_movlhps"); + let x86_movsd = x86.by_name("x86_movsd"); let x86_pop = x86.by_name("x86_pop"); let x86_pextr = x86.by_name("x86_pextr"); + let x86_pinsr = x86.by_name("x86_pinsr"); let x86_pshufd = x86.by_name("x86_pshufd"); let x86_pshufb = x86.by_name("x86_pshufb"); let x86_push = x86.by_name("x86_push"); @@ -501,6 +504,7 @@ pub(crate) fn define( let rec_f64imm_z = r.template("f64imm_z"); let rec_fa = r.template("fa"); let rec_fax = r.template("fax"); + let rec_fa_ib = r.template("fa_ib"); let rec_fcmp = r.template("fcmp"); let rec_fcscc = r.template("fcscc"); let rec_ffillnull = r.recipe("ffillnull"); @@ -1785,16 +1789,16 @@ pub(crate) fn define( } // SIMD insertlane - let mut insertlane_mapping: HashMap, Option)> = + let mut x86_pinsr_mapping: HashMap, Option)> = HashMap::new(); - insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB - insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2 - insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD - insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 
0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64 + x86_pinsr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], Some(use_sse41_simd))); // PINSRB + x86_pinsr_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], None)); // PINSRW from SSE2 + x86_pinsr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRD + x86_pinsr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], Some(use_sse41_simd))); // PINSRQ, only x86_64 for ty in ValueType::all_lane_types().filter(allowed_simd_type) { - if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) { - let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size); + if let Some((opcode, isap)) = x86_pinsr_mapping.get(&ty.lane_bits()) { + let instruction = x86_pinsr.bind_vector_from_lane(ty, sse_vector_size); let template = rec_r_ib_unsigned_r.opcodes(opcode.clone()); if ty.lane_bits() < 64 { e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap.clone()); @@ -1805,13 +1809,34 @@ pub(crate) fn define( } } + // for legalizing insertlane with floats, INSERTPS from SSE4.1 + { + let instruction = x86_insertps.bind_vector_from_lane(F32, sse_vector_size); + let template = rec_fa_ib.nonrex().opcodes(vec![0x66, 0x0f, 0x3a, 0x21]); + e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd)); + } + + // for legalizing insertlane with floats, MOVSD from SSE2 + { + let instruction = x86_movsd.bind_vector_from_lane(F64, sse_vector_size); + let template = rec_fa.nonrex().opcodes(vec![0xf2, 0x0f, 0x10]); + e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2 + } + + // for legalizing insertlane with floats, MOVLHPS from SSE + { + let instruction = x86_movlhps.bind_vector_from_lane(F64, sse_vector_size); + let template = rec_fa.nonrex().opcodes(vec![0x0f, 0x16]); + e.enc_32_64_maybe_isap(instruction, template, None); // from SSE + } + // SIMD extractlane let mut x86_pextr_mapping: HashMap, Option)> = HashMap::new(); - x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 
0x14], Some(use_sse41))); // PEXTRB - x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from zSSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes - x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRD - x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41))); // PEXTRQ, only x86_64 + x86_pextr_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], Some(use_sse41_simd))); // PEXTRB + x86_pextr_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], None)); // PEXTRW from SSE2, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes + x86_pextr_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRD + x86_pextr_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], Some(use_sse41_simd))); // PEXTRQ, only x86_64 for ty in ValueType::all_lane_types().filter(allowed_simd_type) { if let Some((opcode, isap)) = x86_pextr_mapping.get(&ty.lane_bits()) { diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 3f583c6edb..b9f2496a85 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -308,5 +308,84 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let IBxN = &TypeVar::new( + "IBxN", + "A SIMD vector type containing only booleans and integers", + TypeSetBuilder::new() + .ints(Interval::All) + .bools(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let x = &operand("x", IBxN); + let y = &operand_doc("y", &IBxN.lane_of(), "New lane value"); + let a = &operand("a", IBxN); + + ig.push( + Inst::new( + "x86_pinsr", + r#" + Insert ``y`` into ``x`` at lane ``Idx``. + The lane index, ``Idx``, is an immediate value, not an SSA value. It + must indicate a valid lane index for the type of ``x``. 
+ "#, + ) + .operands_in(vec![x, Idx, y]) + .operands_out(vec![a]), + ); + + let FxN = &TypeVar::new( + "FxN", + "A SIMD vector type containing floats", + TypeSetBuilder::new() + .floats(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let x = &operand("x", FxN); + let y = &operand_doc("y", &FxN.lane_of(), "New lane value"); + let a = &operand("a", FxN); + + ig.push( + Inst::new( + "x86_insertps", + r#" + Insert a lane of ``y`` into ``x`` at using ``Idx`` to encode both which lane the value is + extracted from and which it is inserted to. This is similar to x86_pinsr but inserts + floats, which are already stored in an XMM register. + "#, + ) + .operands_in(vec![x, Idx, y]) + .operands_out(vec![a]), + ); + + let x = &operand("x", FxN); + let y = &operand("y", FxN); + let a = &operand("a", FxN); + + ig.push( + Inst::new( + "x86_movsd", + r#" + Move the low 64 bits of the float vector ``y`` to the low 64 bits of float vector ``x`` + "#, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_movlhps", + r#" + Move the low 64 bits of the float vector ``y`` to the high 64 bits of float vector ``x`` + "#, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + ig.build() } diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 4c2ebaefd4..555a93f9cb 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -381,6 +381,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct } narrow.custom_legalize(extractlane, "convert_extractlane"); + narrow.custom_legalize(insertlane, "convert_insertlane"); narrow.build_and_add_to(&mut shared.transform_groups); } diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index 3f14769dee..8176effc42 100644 --- 
a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -566,6 +566,27 @@ pub(crate) fn define<'shared>( ), ); + // XX /r with FPR ins and outs. A form with a byte immediate. + { + let format = formats.get(f_insert_lane); + recipes.add_template_recipe( + EncodingRecipeBuilder::new("fa_ib", f_insert_lane, 2) + .operands_in(vec![fpr, fpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + format, "lane", 8, 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); + modrm_rr(in_reg1, in_reg0, sink); + let imm:i64 = lane.into(); + sink.put1(imm as u8); + "#, + ), + ); + } + // XX /n for a unary operation with extension bits. recipes.add_template_recipe( EncodingRecipeBuilder::new("ur", f_unary, 1) diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index 39bdb57845..f67d7f0b69 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -950,3 +950,65 @@ fn convert_extractlane( } } } + +/// Legalize a CLIF `insertlane`: float lanes stay in XMM registers (INSERTPS for F32X4, +/// MOVSD/MOVLHPS for F64X2); integer and boolean lanes are lowered to `x86_pinsr`. +fn convert_insertlane( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::InsertLane { + opcode: ir::Opcode::Insertlane, + args: [vector, replacement], + lane, + } = pos.func.dfg[inst] + { + let value_type = pos.func.dfg.value_type(vector); + if value_type.lane_type().is_float() { + // Floats are already in XMM registers and can stay there. + match value_type { + F32X4 => { + assert!(lane <= 3); + let immediate = 0b00_00_00_00 | lane << 4; + // Insert 32-bits from replacement (at index 00, bits 7:6) to vector (lane + // shifted into bits 5:4). 
+ pos.func + .dfg + .replace(inst) + .x86_insertps(vector, immediate, replacement) + } + F64X2 => { + let replacement_as_vector = pos.ins().raw_bitcast(F64X2, replacement); // only necessary due to SSA types + if lane == 0 { + // Move the lowest quadword in replacement to vector without changing + // the upper bits. + pos.func + .dfg + .replace(inst) + .x86_movsd(vector, replacement_as_vector) + } else { + assert_eq!(lane, 1); + // Move the low 64 bits of replacement vector to the high 64 bits of the + // vector. + pos.func + .dfg + .replace(inst) + .x86_movlhps(vector, replacement_as_vector) + } + } + _ => unreachable!(), + }; + } else { + // For non-floats, lower with the usual PINSR* instruction. + pos.func + .dfg + .replace(inst) + .x86_pinsr(vector, lane, replacement); + } + } +} diff --git a/cranelift/codegen/src/verifier/locations.rs b/cranelift/codegen/src/verifier/locations.rs index bf1a4e1860..cf17ae13de 100644 --- a/cranelift/codegen/src/verifier/locations.rs +++ b/cranelift/codegen/src/verifier/locations.rs @@ -107,8 +107,10 @@ impl<'a> LocationVerifier<'a> { fatal!( errors, inst, - "{} constraints not satisfied", - self.encinfo.display(enc) + "{} constraints not satisfied in: {}\n{}", + self.encinfo.display(enc), + self.func.dfg.display_inst(inst, self.isa), + self.func.display(self.isa) ) } diff --git a/cranelift/filetests/filetests/isa/x86/extractlane-run.clif b/cranelift/filetests/filetests/isa/x86/extractlane-run.clif index 4590bd0673..adb2e7b8e6 100644 --- a/cranelift/filetests/filetests/isa/x86/extractlane-run.clif +++ b/cranelift/filetests/filetests/isa/x86/extractlane-run.clif @@ -28,3 +28,41 @@ ebb0: return v3 } ; run + +function %test_extractlane_i32_with_vector_reuse() -> b1 { +ebb0: + v0 = iconst.i32 42 + v1 = iconst.i32 99 + + v2 = splat.i32x4 v0 + v3 = insertlane v2, 2, v1 + + v4 = extractlane v3, 3 + v5 = icmp eq v4, v0 + + v6 = extractlane v3, 2 + v7 = icmp eq v6, v1 + + v8 = band v5, v7 + return v8 +} +; run + +function 
%test_extractlane_f32_with_vector_reuse() -> b1 { +ebb0: + v0 = f32const 0x42.42 + v1 = f32const 0x99.99 + + v2 = splat.f32x4 v0 + v3 = insertlane v2, 2, v1 + + v4 = extractlane v3, 3 + v5 = fcmp eq v4, v0 + + v6 = extractlane v3, 2 + v7 = fcmp eq v6, v1 + + v8 = band v5, v7 + return v8 +} +; run diff --git a/cranelift/filetests/filetests/isa/x86/insertlane-binemit.clif b/cranelift/filetests/filetests/isa/x86/insertlane-binemit.clif new file mode 100644 index 0000000000..49048130c0 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/insertlane-binemit.clif @@ -0,0 +1,42 @@ +test binemit +set enable_simd +target x86_64 haswell + +; for insertlane, floats are legalized differently than integers and booleans; integers and booleans use x86_pinsr +; which is manually placed in the IR so that it can be binemit-tested + +function %test_insertlane_b8() { +ebb0: +[-, %rax] v0 = bconst.b8 true +[-, %rbx] v1 = bconst.b8 false +[-, %xmm0] v2 = splat.b8x16 v0 +[-, %xmm0] v3 = x86_pinsr v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a + return +} + +function %test_insertlane_i16() { +ebb0: +[-, %rax] v0 = iconst.i16 4 +[-, %rbx] v1 = iconst.i16 5 +[-, %xmm1] v2 = splat.i16x8 v0 +[-, %xmm1] v3 = x86_pinsr v2, 4, v1 ; bin: 66 0f c4 cb 04 + return +} + +function %test_insertlane_i32() { +ebb0: +[-, %rax] v0 = iconst.i32 42 +[-, %rbx] v1 = iconst.i32 99 +[-, %xmm4] v2 = splat.i32x4 v0 +[-, %xmm4] v3 = x86_pinsr v2, 2, v1 ; bin: 66 0f 3a 22 e3 02 + return +} + +function %test_insertlane_b64() { +ebb0: +[-, %rax] v0 = bconst.b64 true +[-, %rbx] v1 = bconst.b64 false +[-, %xmm2] v2 = splat.b64x2 v0 +[-, %xmm2] v3 = x86_pinsr v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01 + return +} diff --git a/cranelift/filetests/filetests/isa/x86/insertlane-run.clif b/cranelift/filetests/filetests/isa/x86/insertlane-run.clif new file mode 100644 index 0000000000..92fb38202e --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/insertlane-run.clif @@ -0,0 +1,48 @@ +test run +set enable_simd + +; TODO once SIMD 
vector comparison is implemented, remove use of extractlane below + +function %test_insertlane_b8() -> b8 { +ebb0: + v1 = bconst.b8 true + v2 = vconst.b8x16 [false false false false false false false false false false false false false + false false false] + v3 = insertlane v2, 10, v1 + v4 = extractlane v3, 10 + return v4 +} +; run + +function %test_insertlane_f32() -> b1 { +ebb0: + v0 = f32const 0x42.42 + v1 = vconst.f32x4 0x00 + v2 = insertlane v1, 1, v0 + v3 = extractlane v2, 1 + v4 = fcmp eq v3, v0 + return v4 +} +; run + +function %test_insertlane_f64_lane1() -> b1 { +ebb0: + v0 = f64const 0x42.42 + v1 = vconst.f64x2 0x00 + v2 = insertlane v1, 1, v0 + v3 = extractlane v2, 1 + v4 = fcmp eq v3, v0 + return v4 +} +; run + +function %test_insertlane_f64_lane0() -> b1 { +ebb0: + v0 = f64const 0x42.42 + v1 = vconst.f64x2 0x00 + v2 = insertlane v1, 0, v0 + v3 = extractlane v2, 0 + v4 = fcmp eq v3, v0 + return v4 +} +; run diff --git a/cranelift/filetests/filetests/isa/x86/insertlane.clif b/cranelift/filetests/filetests/isa/x86/insertlane.clif deleted file mode 100644 index c55dc40333..0000000000 --- a/cranelift/filetests/filetests/isa/x86/insertlane.clif +++ /dev/null @@ -1,39 +0,0 @@ -test binemit -set enable_simd -target x86_64 haswell - -function %test_insertlane_b8() { -ebb0: -[-, %rax] v0 = bconst.b8 true -[-, %rbx] v1 = bconst.b8 false -[-, %xmm0] v2 = splat.b8x16 v0 -[-, %xmm0] v3 = insertlane v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a - return -} - -function %test_insertlane_i16() { -ebb0: -[-, %rax] v0 = iconst.i16 4 -[-, %rbx] v1 = iconst.i16 5 -[-, %xmm1] v2 = splat.i16x8 v0 -[-, %xmm1] v3 = insertlane v2, 4, v1 ; bin: 66 0f c4 cb 04 - return -} - -function %test_insertlane_i32() { -ebb0: -[-, %rax] v0 = iconst.i32 42 -[-, %rbx] v1 = iconst.i32 99 -[-, %xmm4] v2 = splat.i32x4 v0 -[-, %xmm4] v3 = insertlane v2, 2, v1 ; bin: 66 0f 3a 22 e3 02 - return -} - -function %test_insertlane_f64() { -ebb0: -[-, %rax] v0 = f64const 0x0.0 -[-, %rbx] v1 = f64const 0x4.2 -[-, 
%xmm2] v2 = splat.f64x2 v0 -[-, %xmm2] v3 = insertlane v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01 - return -} diff --git a/cranelift/filetests/filetests/isa/x86/legalize-splat.clif b/cranelift/filetests/filetests/isa/x86/legalize-splat.clif index c0fc83ebe7..19d61d529c 100644 --- a/cranelift/filetests/filetests/isa/x86/legalize-splat.clif +++ b/cranelift/filetests/filetests/isa/x86/legalize-splat.clif @@ -33,7 +33,7 @@ ebb0: ; check: ebb0: ; nextln: v0 = iconst.i64 42 ; nextln: v2 = scalar_to_vector.i64x2 v0 -; nextln: v1 = insertlane v2, 1, v0 +; nextln: v1 = x86_pinsr v2, 1, v0 ; nextln: return v1 @@ -48,7 +48,7 @@ ebb0: ; check: ebb0: ; nextln: v0 = bconst.b16 true ; nextln: v2 = scalar_to_vector.b16x8 v0 -; nextln: v3 = insertlane v2, 1, v0 +; nextln: v3 = x86_pinsr v2, 1, v0 ; nextln: v4 = raw_bitcast.i32x4 v3 ; nextln: v5 = x86_pshufd v4, 0 ; nextln: v1 = raw_bitcast.b16x8 v5