From 3fdc78174ff3fe30d807961821ca870513acf3e2 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Thu, 11 Jul 2019 15:49:28 -0700 Subject: [PATCH] Add x86 implementation of extractlane instruction --- .../codegen/meta/src/isa/x86/encodings.rs | 29 +++++++++++++-- cranelift/codegen/meta/src/isa/x86/recipes.rs | 23 +++++++++++- .../filetests/isa/x86/extractlane.clif | 35 +++++++++++++++++++ 3 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x86/extractlane.clif diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index eaa5614bd0..ed4cf18d94 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -318,6 +318,7 @@ pub fn define( let copy_special = shared.by_name("copy_special"); let ctz = shared.by_name("ctz"); let debugtrap = shared.by_name("debugtrap"); + let extractlane = shared.by_name("extractlane"); let f32const = shared.by_name("f32const"); let f64const = shared.by_name("f64const"); let fadd = shared.by_name("fadd"); @@ -498,7 +499,8 @@ pub fn define( let rec_pushq = r.template("pushq"); let rec_ret = r.template("ret"); let rec_r_ib = r.template("r_ib"); - let rec_r_ib_unsigned = r.template("r_ib_unsigned"); + let rec_r_ib_unsigned_gpr = r.template("r_ib_unsigned_gpr"); + let rec_r_ib_unsigned_fpr = r.template("r_ib_unsigned_fpr"); let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r"); let rec_r_id = r.template("r_id"); let rec_rcmp = r.template("rcmp"); @@ -1642,7 +1644,9 @@ pub fn define( for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { let number_of_lanes = 128 / ty.lane_bits(); let instruction = x86_pshufd.bind_vector(ty, number_of_lanes); - let template = rec_r_ib_unsigned.nonrex().opcodes(vec![0x66, 0x0f, 0x70]); + let template = rec_r_ib_unsigned_fpr + .nonrex() + .opcodes(vec![0x66, 0x0f, 0x70]); e.enc32_isap(instruction.clone(), template.clone(), use_sse2); 
e.enc64_isap(instruction, template, use_sse2); } @@ -1682,6 +1686,27 @@ pub fn define( } } + // SIMD extractlane + let mut extractlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new(); + extractlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x14], use_sse41)); // PEXTRB + extractlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc5], use_sse2)); // PEXTRW, SSE4.1 has a PEXTRW that can move to reg/m16 but the opcode is four bytes + extractlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRD + extractlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x16], use_sse41)); // PEXTRQ, only x86_64 + + for ty in ValueType::all_lane_types() { + if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) { + let number_of_lanes = 128 / ty.lane_bits(); + let instruction = extractlane.bind_vector(ty, number_of_lanes); + let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone()); + if ty.lane_bits() < 64 { + e.enc_32_64_isap(instruction, template.nonrex(), isap.clone()); + } else { + // turns out the 64-bit widths have REX/W encodings and only are available on x86_64 + e.enc64_isap(instruction, template.rex().w(), isap.clone()); + } + } + } + // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) { let instruction = bitcast.bind_vector(ty, 16).bind(F64); diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index b948c0c2e5..623689cea9 100644 --- a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -800,7 +800,7 @@ pub fn define<'shared>( { let format = formats.get(f_extract_lane); recipes.add_template_recipe( - EncodingRecipeBuilder::new("r_ib_unsigned", f_extract_lane, 2) + EncodingRecipeBuilder::new("r_ib_unsigned_fpr", f_extract_lane, 2) .operands_in(vec![fpr]) .operands_out(vec![fpr])
.inst_predicate(InstructionPredicate::new_is_unsigned_int( @@ -817,6 +817,27 @@ pub fn define<'shared>( ); } + // XX /r ib with 8-bit unsigned immediate (e.g. for extractlane) + { + let format = formats.get(f_extract_lane); + recipes.add_template_recipe( + EncodingRecipeBuilder::new("r_ib_unsigned_gpr", f_extract_lane, 2) + .operands_in(vec![fpr]) + .operands_out(vec![gpr]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + format, "lane", 8, 0, + )) + .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(out_reg0, in_reg0, sink); // note the flipped register in the ModR/M byte + let imm:i64 = lane.into(); + sink.put1(imm as u8); + "#, + ), + ); + } + // XX /r ib with 8-bit unsigned immediate (e.g. for insertlane) { let format = formats.get(f_insert_lane); diff --git a/cranelift/filetests/filetests/isa/x86/extractlane.clif b/cranelift/filetests/filetests/isa/x86/extractlane.clif new file mode 100644 index 0000000000..e7a1ea898e --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/extractlane.clif @@ -0,0 +1,35 @@ +test binemit +set enable_simd +target x86_64 haswell + +function %test_extractlane_b8() { +ebb0: +[-, %rax] v0 = bconst.b8 true +[-, %xmm0] v1 = splat.b8x16 v0 +[-, %rax] v2 = extractlane v1, 10 ; bin: 66 0f 3a 14 c0 0a + return +} + +function %test_extractlane_i16() { +ebb0: +[-, %rax] v0 = iconst.i16 4 +[-, %xmm1] v1 = splat.i16x8 v0 +[-, %rax] v2 = extractlane v1, 4 ; bin: 66 0f c5 c8 04 + return +} + +function %test_extractlane_i32() { +ebb0: +[-, %rax] v0 = iconst.i32 42 +[-, %xmm4] v1 = splat.i32x4 v0 +[-, %rcx] v2 = extractlane v1, 2 ; bin: 66 0f 3a 16 e1 02 + return +} + +function %test_extractlane_f64() { +ebb0: +[-, %rax] v0 = f64const 0x0.0 +[-, %xmm2] v1 = splat.f64x2 v0 +[-, %rbx] v2 = extractlane v1, 1 ; bin: 66 48 0f 3a 16 d3 01 + return +}