diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 32c1e2f957..788c0c3ec0 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -326,6 +326,7 @@ pub fn define(
     let ifcmp_sp = shared.by_name("ifcmp_sp");
     let imul = shared.by_name("imul");
     let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
+    let insertlane = shared.by_name("insertlane");
     let ireduce = shared.by_name("ireduce");
     let ishl = shared.by_name("ishl");
     let ishl_imm = shared.by_name("ishl_imm");
@@ -476,6 +477,7 @@ pub fn define(
     let rec_ret = r.template("ret");
     let rec_r_ib = r.template("r_ib");
     let rec_r_ib_unsigned = r.template("r_ib_unsigned");
+    let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r");
     let rec_r_id = r.template("r_id");
     let rec_rcmp = r.template("rcmp");
     let rec_rcmp_ib = r.template("rcmp_ib");
@@ -1626,6 +1628,34 @@ pub fn define(
         e.enc_x86_64_isap(instruction, template, use_sse2);
     }
 
+    // SIMD insertlane
+    let mut insertlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new();
+    insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], use_sse41)); // PINSRB
+    insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], use_sse2)); // PINSRW
+    insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRD
+    insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRQ, only x86_64
+
+    for ty in ValueType::all_lane_types() {
+        if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
+            let number_of_lanes = 128 / ty.lane_bits();
+            let instruction = insertlane.bind_vector(ty, number_of_lanes);
+            let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
+            if ty.lane_bits() < 64 {
+                e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
+            } else {
+                // turns out the 64-bit widths have REX/W encodings and only are available on x86_64
+                e.enc64_isap(instruction, template.rex().w(), isap.clone());
+            }
+        }
+    }
+
+    // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+        let instruction = bitcast.bind_vector(ty, 16).bind(F64);
+        e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
+        e.enc64_rec(instruction, rec_null_fpr, 0);
+    }
+
     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
     for from_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
         for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type)
diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs
index f2353d3a66..11063e39b9 100644
--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -373,6 +373,7 @@ pub fn define<'shared>(
     let f_float_cond_trap = formats.by_name("FloatCondTrap");
     let f_func_addr = formats.by_name("FuncAddr");
     let f_indirect_jump = formats.by_name("IndirectJump");
+    let f_insert_lane = formats.by_name("InsertLane");
     let f_int_compare = formats.by_name("IntCompare");
     let f_int_compare_imm = formats.by_name("IntCompareImm");
     let f_int_cond = formats.by_name("IntCond");
@@ -816,6 +817,27 @@ pub fn define<'shared>(
         );
     }
 
+    // XX /r ib with 8-bit unsigned immediate (e.g. for insertlane)
+    {
+        let format = formats.get(f_insert_lane);
+        recipes.add_template_recipe(
+            EncodingRecipeBuilder::new("r_ib_unsigned_r", f_insert_lane, 2)
+                .operands_in(vec![fpr, gpr])
+                .operands_out(vec![0])
+                .inst_predicate(InstructionPredicate::new_is_unsigned_int(
+                    format, "lane", 8, 0,
+                ))
+                .emit(
+                    r#"
+                    {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+                    modrm_rr(in_reg1, in_reg0, sink);
+                    let imm:i64 = lane.into();
+                    sink.put1(imm as u8);
+                "#,
+                ),
+        );
+    }
+
     {
         // XX /n id with 32-bit immediate sign-extended. UnaryImm version.
         let format = formats.get(f_unary_imm);
diff --git a/cranelift/filetests/filetests/isa/x86/insertlane.clif b/cranelift/filetests/filetests/isa/x86/insertlane.clif
new file mode 100644
index 0000000000..c55dc40333
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/insertlane.clif
@@ -0,0 +1,39 @@
+test binemit
+set enable_simd
+target x86_64 haswell
+
+function %test_insertlane_b8() {
+ebb0:
+[-, %rax] v0 = bconst.b8 true
+[-, %rbx] v1 = bconst.b8 false
+[-, %xmm0] v2 = splat.b8x16 v0
+[-, %xmm0] v3 = insertlane v2, 10, v1 ; bin: 66 0f 3a 20 c3 0a
+    return
+}
+
+function %test_insertlane_i16() {
+ebb0:
+[-, %rax] v0 = iconst.i16 4
+[-, %rbx] v1 = iconst.i16 5
+[-, %xmm1] v2 = splat.i16x8 v0
+[-, %xmm1] v3 = insertlane v2, 4, v1 ; bin: 66 0f c4 cb 04
+    return
+}
+
+function %test_insertlane_i32() {
+ebb0:
+[-, %rax] v0 = iconst.i32 42
+[-, %rbx] v1 = iconst.i32 99
+[-, %xmm4] v2 = splat.i32x4 v0
+[-, %xmm4] v3 = insertlane v2, 2, v1 ; bin: 66 0f 3a 22 e3 02
+    return
+}
+
+function %test_insertlane_f64() {
+ebb0:
+[-, %rax] v0 = f64const 0x0.0
+[-, %rbx] v1 = f64const 0x4.2
+[-, %xmm2] v2 = splat.f64x2 v0
+[-, %xmm2] v3 = insertlane v2, 1, v1 ; bin: 66 48 0f 3a 22 d3 01
+    return
+}