diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index c34c13d089..32c1e2f957 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -250,6 +250,17 @@ impl PerCpuModeEncodings { self.enc64(inst.clone().bind(I64).bind_any(), template); } } + + /// Add the same encoding to both X86_32 and X86_64, forwarding the ISA predicate `isap` to each; assumes configuration (e.g. REX, operand binding) has already happened + fn enc_32_64_isap( + &mut self, + inst: BoundInstruction, + template: Template, + isap: SettingPredicateNumber, + ) { + self.enc32_isap(inst.clone(), template.clone(), isap); + self.enc64_isap(inst, template, isap); + } } // Definitions. @@ -379,6 +390,8 @@ pub fn define( let x86_fmax = x86.by_name("x86_fmax"); let x86_fmin = x86.by_name("x86_fmin"); let x86_pop = x86.by_name("x86_pop"); + let x86_pshufd = x86.by_name("x86_pshufd"); + let x86_pshufb = x86.by_name("x86_pshufb"); let x86_push = x86.by_name("x86_push"); let x86_sdivmodx = x86.by_name("x86_sdivmodx"); let x86_smulx = x86.by_name("x86_smulx"); @@ -462,6 +475,7 @@ pub fn define( let rec_pushq = r.template("pushq"); let rec_ret = r.template("ret"); let rec_r_ib = r.template("r_ib"); + let rec_r_ib_unsigned = r.template("r_ib_unsigned"); let rec_r_id = r.template("r_id"); let rec_rcmp = r.template("rcmp"); let rec_rcmp_ib = r.template("rcmp_ib"); @@ -519,6 +533,7 @@ pub fn define( let use_lzcnt = settings.predicate_by_name("use_lzcnt"); let use_bmi1 = settings.predicate_by_name("use_bmi1"); let use_sse2 = settings.predicate_by_name("use_sse2"); + let use_ssse3 = settings.predicate_by_name("use_ssse3"); let use_sse41 = settings.predicate_by_name("use_sse41"); // Definitions. 
@@ -1575,6 +1590,26 @@ pub fn define(
     e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
     e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
 
+    // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
+    // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
+    // value across the register
+
+    // PSHUFB, 8-bit shuffle using two XMM registers; added to both CPU modes under `use_ssse3`
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+        let number_of_lanes = 128 / ty.lane_bits();
+        let instruction = x86_pshufb.bind_vector(ty, number_of_lanes);
+        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
+        e.enc_32_64_isap(instruction, template, use_ssse3);
+    }
+
+    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate; added to both CPU modes
+    // under `use_sse2`
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
+        let number_of_lanes = 128 / ty.lane_bits();
+        let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
+        let template = rec_r_ib_unsigned.nonrex().opcodes(vec![0x66, 0x0f, 0x70]);
+        e.enc_32_64_isap(instruction, template, use_sse2);
+    }
+
     // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
     // to the Intel manual: "When the destination operand is an XMM register, the source operand is
     // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 9aa7363e1a..6464261938 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -7,7 +7,7 @@ use crate::cdsl::instructions::{ use crate::cdsl::operands::{create_operand as operand, create_operand_doc as operand_doc}; use crate::cdsl::types::ValueType; use crate::cdsl::typevar::{Interval, TypeSetBuilder, TypeVar}; -use crate::shared::types; +use crate::shared::{immediates, types, OperandKinds}; pub fn define( mut all_instructions: &mut AllInstructions, @@ -249,5 +249,46 @@ pub fn define( .operands_out(vec![y, rflags]), ); + let immediates = OperandKinds::from(immediates::define()); + let uimm8 = immediates.by_name("uimm8"); + let TxN = &TypeVar::new( + "TxN", + "A SIMD vector type", + TypeSetBuilder::new() + .ints(Interval::All) + .floats(Interval::All) + .bools(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let a = &operand_doc("a", TxN, "A vector value (i.e. held in an XMM register)"); + let b = &operand_doc("b", TxN, "A vector value (i.e. held in an XMM register)"); + let i = &operand_doc("i", uimm8, "An ordering operand controlling the copying of data from the source to the destination; see PSHUFD in Intel manual for details"); + + ig.push( + Inst::new( + "x86_pshufd", + r#" + Packed Shuffle Doublewords -- copies data from either memory or lanes in an extended + register and re-orders the data according to the passed immediate byte. 
+ "#, + ) + .operands_in(vec![a, i]) // TODO: allow the source to also come from memory here (needs a more permissive operand type than TxN) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "x86_pshufb", + r#" + Packed Shuffle Bytes -- re-orders data in an extended register using a shuffle + mask from either memory or another extended register + "#, + ) + .operands_in(vec![a, b]) // TODO: allow the shuffle mask to also come from memory here (needs a more permissive operand type than TxN) + .operands_out(vec![a]), + ); + ig.build() } diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index 45441cf67a..f2353d3a66 100644 --- a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -367,6 +367,7 @@ pub fn define<'shared>( let f_call = formats.by_name("Call"); let f_call_indirect = formats.by_name("CallIndirect"); let f_copy_special = formats.by_name("CopySpecial"); + let f_extract_lane = formats.by_name("ExtractLane"); // TODO: this would preferably retrieve a BinaryImm8 format, but formats are compared structurally and ExtractLane has the same structure, so this is impossible--if we rename ExtractLane, it may even impact parsing let f_float_compare = formats.by_name("FloatCompare"); let f_float_cond = formats.by_name("FloatCond"); let f_float_cond_trap = formats.by_name("FloatCondTrap"); @@ -794,6 +795,27 @@ pub fn define<'shared>( ); } + // XX /r ib with 8-bit unsigned immediate (e.g. for pshufd) + { + let format = formats.get(f_extract_lane); + recipes.add_template_recipe( + EncodingRecipeBuilder::new("r_ib_unsigned", f_extract_lane, 2) + .operands_in(vec![fpr]) + .operands_out(vec![fpr]) + .inst_predicate(InstructionPredicate::new_is_unsigned_int( + format, "lane", 8, 0, + )) // TODO: if the format name is changed then "lane" should be renamed to something more appropriate--ordering mask? broadcast immediate? 
+ .emit( + r#" + {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); + modrm_rr(in_reg0, out_reg0, sink); + let imm:i64 = lane.into(); + sink.put1(imm as u8); + "#, + ), + ); + } + { // XX /n id with 32-bit immediate sign-extended. UnaryImm version. let format = formats.get(f_unary_imm); diff --git a/cranelift/filetests/filetests/isa/x86/pshufb.clif b/cranelift/filetests/filetests/isa/x86/pshufb.clif new file mode 100644 index 0000000000..7c23c5ab61 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/pshufb.clif @@ -0,0 +1,13 @@ +test binemit +set enable_simd +target x86_64 has_sse2=true has_ssse3=true + +function %test_pshufb() { +ebb0: +[-, %rax] v0 = iconst.i8 42 +[-, %xmm0] v1 = scalar_to_vector.i8x16 v0 ; bin: 66 40 0f 6e c0 +[-, %rbx] v2 = iconst.i8 43 +[-, %xmm4] v3 = scalar_to_vector.i8x16 v2 ; bin: 66 40 0f 6e e3 +[-, %xmm0] v4 = x86_pshufb v1, v3 ; bin: 66 0f 38 00 c4 + return +} diff --git a/cranelift/filetests/filetests/isa/x86/pshufd.clif b/cranelift/filetests/filetests/isa/x86/pshufd.clif new file mode 100644 index 0000000000..183af4fc0e --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/pshufd.clif @@ -0,0 +1,11 @@ +test binemit +set enable_simd +target x86_64 has_sse2=true + +function %test_pshuf() { +ebb0: +[-, %rax] v0 = iconst.i32 42 +[-, %xmm0] v1 = scalar_to_vector.i32x4 v0 ; bin: 66 40 0f 6e c0 +[-, %xmm0] v2 = x86_pshufd v1, 0 ; bin: 66 0f 70 c0 00 + return +}