diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index a029e8170f..f7f8964905 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -521,6 +521,7 @@ pub(crate) fn define<'defs>( let x86_pinsr = x86.by_name("x86_pinsr"); let x86_pshufd = x86.by_name("x86_pshufd"); let x86_pshufb = x86.by_name("x86_pshufb"); + let x86_psll = x86.by_name("x86_psll"); let x86_push = x86.by_name("x86_push"); let x86_sdivmodx = x86.by_name("x86_sdivmodx"); let x86_smulx = x86.by_name("x86_smulx"); @@ -1988,6 +1989,26 @@ pub(crate) fn define<'defs>( e.enc_32_64(bxor, rec_fa.opcodes(&PXOR)); } + // SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement + // allows SIMD shifts to be legalized more easily. TODO ideally this would be typed as an + // I128x1 but restrictions on the type builder prevent this; the general idea here is that + // the upper bits are all zeroed and do not form parts of any separate lane. See + // https://github.com/CraneStation/cranelift/issues/1146. 
+ e.enc_both( + bitcast.bind(vector(I64, sse_vector_size)).bind(I32), + rec_frurm.opcodes(&MOVD_LOAD_XMM), + ); + e.enc64( + bitcast.bind(vector(I64, sse_vector_size)).bind(I64), + rec_frurm.opcodes(&MOVD_LOAD_XMM).rex().w(), + ); + + // SIMD shift left + for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] { + let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size)); + e.enc_32_64(x86_psll, rec_fa.opcodes(*opcodes)); + } + // SIMD icmp using PCMPEQ* for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) { let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() { diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index b9f2496a85..c8839e78a8 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -387,5 +387,40 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let IxN = &TypeVar::new( + "IxN", + "A SIMD vector type containing integers", + TypeSetBuilder::new() + .ints(Interval::All) + .simd_lanes(Interval::All) + .includes_scalars(false) + .build(), + ); + let I64x2 = &TypeVar::new( + "I64x2", + "A SIMD vector type containing one large integer (the upper lane is concatenated with \ + the lower lane to form the integer)", + TypeSetBuilder::new() + .ints(64..64) + .simd_lanes(2..2) + .includes_scalars(false) + .build(), + ); + let x = &operand_doc("x", IxN, "Vector value to shift"); + let y = &operand_doc("y", I64x2, "Number of bits to shift"); + let a = &operand("a", IxN); + ig.push( + Inst::new( + "x86_psll", + r#" + Shift Packed Data Left Logical -- This implements the behavior of the shared instruction + ``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL* + family of instructions. 
+ "#, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + ig.build() } diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 04951c3d5b..8b71bfd637 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -3,7 +3,7 @@ use crate::cdsl::instructions::{vector, Bindable, InstructionGroup}; use crate::cdsl::types::{LaneType, ValueType}; use crate::cdsl::xform::TransformGroupBuilder; use crate::shared::types::Float::F64; -use crate::shared::types::Int::{I32, I64}; +use crate::shared::types::Int::{I16, I32, I64}; use crate::shared::Definitions as SharedDefinitions; pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup) { @@ -20,6 +20,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct // List of instructions. let insts = &shared.instructions; let band = insts.by_name("band"); + let bitcast = insts.by_name("bitcast"); let bor = insts.by_name("bor"); let bnot = insts.by_name("bnot"); let bxor = insts.by_name("bxor"); @@ -40,6 +41,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct let imul = insts.by_name("imul"); let ineg = insts.by_name("ineg"); let insertlane = insts.by_name("insertlane"); + let ishl = insts.by_name("ishl"); let isub = insts.by_name("isub"); let popcnt = insts.by_name("popcnt"); let raw_bitcast = insts.by_name("raw_bitcast"); @@ -60,6 +62,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct let x86_bsr = x86_instructions.by_name("x86_bsr"); let x86_pshufb = x86_instructions.by_name("x86_pshufb"); let x86_pshufd = x86_instructions.by_name("x86_pshufd"); + let x86_psll = x86_instructions.by_name("x86_psll"); let x86_umulx = x86_instructions.by_name("x86_umulx"); let x86_smulx = x86_instructions.by_name("x86_smulx"); @@ -394,6 +397,16 @@ pub(crate) fn define(shared: &mut SharedDefinitions, 
x86_instructions: &Instruct ); } + // SIMD shift left + for ty in &[I16, I32, I64] { + let ishl = ishl.bind(vector(*ty, sse_vector_size)); + let bitcast = bitcast.bind(vector(I64, sse_vector_size)); + narrow.legalize( + def!(a = ishl(x, y)), + vec![def!(b = bitcast(y)), def!(a = x86_psll(x, b))], + ); + } + narrow.custom_legalize(shuffle, "convert_shuffle"); narrow.custom_legalize(extractlane, "convert_extractlane"); narrow.custom_legalize(insertlane, "convert_insertlane"); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index b7f223eb27..6e3859d848 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -320,6 +320,15 @@ pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00]; /// store the result in xmm1 (SSE2). pub static PSHUFD: [u8; 3] = [0x66, 0x0f, 0x70]; +/// Shift words in xmm1 left by xmm2/m128 while shifting in 0s (SSE2). +pub static PSLLW: [u8; 3] = [0x66, 0x0f, 0xf1]; + +/// Shift doublewords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2). +pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2]; + +/// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2). +pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3]; + /// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2). 
pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8]; diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif new file mode 100644 index 0000000000..5cfb4375d7 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif @@ -0,0 +1,21 @@ +test binemit +set enable_simd +target x86_64 skylake + +function %ishl_i16x8(i16x8, i64x2) -> i16x8 { +ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]): +[-, %xmm2] v2 = x86_psll v0, v1 ; bin: 66 0f f1 d1 + return v2 +} + +function %ishl_i32x4(i32x4, i64x2) -> i32x4 { +ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]): +[-, %xmm4] v2 = x86_psll v0, v1 ; bin: 66 0f f2 e0 + return v2 +} + +function %ishl_i64x2(i64x2, i64x2) -> i64x2 { +ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]): +[-, %xmm6] v2 = x86_psll v0, v1 ; bin: 66 0f f3 f3 + return v2 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif new file mode 100644 index 0000000000..5c2893950d --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif @@ -0,0 +1,13 @@ +test legalizer +set enable_simd +target x86_64 skylake + +function %ishl_i32x4() -> i32x4 { +ebb0: + v0 = iconst.i32 1 + v1 = vconst.i32x4 [1 2 4 8] + v2 = ishl v1, v0 + ; check: v3 = bitcast.i64x2 v0 + ; nextln: v2 = x86_psll v1, v3 + return v2 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif new file mode 100644 index 0000000000..224b3d5470 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif @@ -0,0 +1,39 @@ +test run +set enable_simd +target x86_64 skylake + +; TODO: once available, replace all lane extraction with `icmp + all_ones` + +function %ishl_i32x4() -> b1 { +ebb0: + v0 = iconst.i32 1 + v1 = vconst.i32x4 [1 2 4 8] + v2 = ishl v1, v0 + + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 2 + 
+ v5 = extractlane v2, 3 + v6 = icmp_imm eq v5, 16 + + v7 = band v4, v6 + return v7 +} +; run + +function %ishl_too_large_i16x8() -> b1 { +ebb0: + v0 = iconst.i32 17 ; note that this will shift off the end of each lane + v1 = vconst.i16x8 [1 2 4 8 16 32 64 128] + v2 = ishl v1, v0 + + v3 = extractlane v2, 0 + v4 = icmp_imm eq v3, 0 + + v5 = extractlane v2, 3 + v6 = icmp_imm eq v5, 0 + + v7 = band v4, v6 + return v7 +} +; run