From f1904bffea935bcc68f1e9666bf018c46653e144 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Mon, 7 Oct 2019 10:38:35 -0700
Subject: [PATCH] Add x86 SIMD sshr and ushr

Only the shifts with applicable SSE2 instructions are implemented here:
PSRL* (for ushr) only exists for 16-, 32-, and 64-bit lanes, and PSRA*
(for sshr) only for 16- and 32-bit lanes.
---
 .../codegen/meta/src/isa/x86/encodings.rs     | 14 ++++
 .../codegen/meta/src/isa/x86/instructions.rs  | 26 ++++++-
 .../codegen/meta/src/isa/x86/legalize.rs      | 26 ++++++-
 cranelift/codegen/meta/src/isa/x86/opcodes.rs | 15 ++++
 .../isa/x86/simd-bitwise-binemit.clif         | 30 ++++++++
 .../isa/x86/simd-bitwise-legalize.clif        | 20 ++++++
 .../filetests/isa/x86/simd-bitwise-run.clif   | 68 +++++++++++++++++++
 7 files changed, 197 insertions(+), 2 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index f7f8964905..8457369929 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -522,6 +522,8 @@ pub(crate) fn define<'defs>(
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_psll = x86.by_name("x86_psll");
+    let x86_psra = x86.by_name("x86_psra");
+    let x86_psrl = x86.by_name("x86_psrl");
     let x86_push = x86.by_name("x86_push");
     let x86_sdivmodx = x86.by_name("x86_sdivmodx");
     let x86_smulx = x86.by_name("x86_smulx");
@@ -2009,6 +2011,18 @@ pub(crate) fn define<'defs>(
         e.enc_32_64(x86_psll, rec_fa.opcodes(*opcodes));
     }
 
+    // SIMD shift right (logical)
+    for (ty, opcodes) in &[(I16, &PSRLW), (I32, &PSRLD), (I64, &PSRLQ)] {
+        let x86_psrl = x86_psrl.bind(vector(*ty, sse_vector_size));
+        e.enc_32_64(x86_psrl, rec_fa.opcodes(*opcodes));
+    }
+
+    // SIMD shift right (arithmetic)
+    for (ty, opcodes) in &[(I16, &PSRAW), (I32, &PSRAD)] {
+        let x86_psra = x86_psra.bind(vector(*ty, sse_vector_size));
+        e.enc_32_64(x86_psra, rec_fa.opcodes(*opcodes));
+    }
+
     // SIMD icmp using PCMPEQ*
     for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) {
         let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() {
diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs
index c8839e78a8..6ed1e88999 100644
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -414,13 +414,37 @@ pub(crate) fn define(
             "x86_psll",
             r#"
         Shift Packed Data Left Logical -- This implements the behavior of the shared instruction
-        ``ishl`` but alters the shift operand to live in an XMM register as expected by the PSSL*
+        ``ishl`` but alters the shift operand to live in an XMM register as expected by the PSLL*
         family of instructions.
         "#,
         )
         .operands_in(vec![x, y])
         .operands_out(vec![a]),
     );
+    ig.push(
+        Inst::new(
+            "x86_psrl",
+            r#"
+        Shift Packed Data Right Logical -- This implements the behavior of the shared instruction
+        ``ushr`` but alters the shift operand to live in an XMM register as expected by the PSRL*
+        family of instructions.
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+    ig.push(
+        Inst::new(
+            "x86_psra",
+            r#"
+        Shift Packed Data Right Arithmetic -- This implements the behavior of the shared
+        instruction ``sshr`` but alters the shift operand to live in an XMM register as expected by
+        the PSRA* family of instructions.
+        "#,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
 
     ig.build()
 }
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 8b71bfd637..8d9033d3a9 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -52,10 +52,12 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     let splat = insts.by_name("splat");
     let shuffle = insts.by_name("shuffle");
     let srem = insts.by_name("srem");
+    let sshr = insts.by_name("sshr");
     let udiv = insts.by_name("udiv");
     let umulhi = insts.by_name("umulhi");
     let ushr_imm = insts.by_name("ushr_imm");
     let urem = insts.by_name("urem");
+    let ushr = insts.by_name("ushr");
     let vconst = insts.by_name("vconst");
 
     let x86_bsf = x86_instructions.by_name("x86_bsf");
@@ -63,6 +65,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     let x86_pshufb = x86_instructions.by_name("x86_pshufb");
     let x86_pshufd = x86_instructions.by_name("x86_pshufd");
     let x86_psll = x86_instructions.by_name("x86_psll");
+    let x86_psra = x86_instructions.by_name("x86_psra");
+    let x86_psrl = x86_instructions.by_name("x86_psrl");
     let x86_umulx = x86_instructions.by_name("x86_umulx");
     let x86_smulx = x86_instructions.by_name("x86_smulx");
 
@@ -397,16 +401,36 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
         );
     }
 
-    // SIMD shift left
+    // SIMD shift left (logical)
     for ty in &[I16, I32, I64] {
         let ishl = ishl.bind(vector(*ty, sse_vector_size));
         let bitcast = bitcast.bind(vector(I64, sse_vector_size));
         narrow.legalize(
             def!(a = ishl(x, y)),
             vec![def!(b = bitcast(y)), def!(a = x86_psll(x, b))],
         );
     }
 
+    // SIMD shift right (logical)
+    for ty in &[I16, I32, I64] {
+        let ushr = ushr.bind(vector(*ty, sse_vector_size));
+        let bitcast = bitcast.bind(vector(I64, sse_vector_size));
+        narrow.legalize(
+            def!(a = ushr(x, y)),
+            vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
+        );
+    }
+
+    // SIMD shift right (arithmetic); PSRA* has no 64-bit form in SSE2
+    for ty in &[I16, I32] {
+        let sshr = sshr.bind(vector(*ty, sse_vector_size));
+        let bitcast = bitcast.bind(vector(I64, sse_vector_size));
+        narrow.legalize(
+            def!(a = sshr(x, y)),
+            vec![def!(b = bitcast(y)), def!(a = x86_psra(x, b))],
+        );
+    }
+
     narrow.custom_legalize(shuffle, "convert_shuffle");
     narrow.custom_legalize(extractlane, "convert_extractlane");
     narrow.custom_legalize(insertlane, "convert_insertlane");
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index 6e3859d848..0fa7c8a7f7 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -329,6 +329,21 @@ pub static PSLLD: [u8; 3] = [0x66, 0x0f, 0xf2];
 
 /// Shift quadwords in xmm1 left by xmm2/m128 while shifting in 0s (SSE2).
 pub static PSLLQ: [u8; 3] = [0x66, 0x0f, 0xf3];
 
+/// Shift words in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSRLW: [u8; 3] = [0x66, 0x0f, 0xd1];
+
+/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSRLD: [u8; 3] = [0x66, 0x0f, 0xd2];
+
+/// Shift quadwords in xmm1 right by xmm2/m128 while shifting in 0s (SSE2).
+pub static PSRLQ: [u8; 3] = [0x66, 0x0f, 0xd3];
+
+/// Shift words in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
+pub static PSRAW: [u8; 3] = [0x66, 0x0f, 0xe1];
+
+/// Shift doublewords in xmm1 right by xmm2/m128 while shifting in sign bits (SSE2).
+pub static PSRAD: [u8; 3] = [0x66, 0x0f, 0xe2];
+
 /// Subtract packed byte integers in xmm2/m128 from packed byte integers in xmm1 (SSE2).
 pub static PSUBB: [u8; 3] = [0x66, 0x0f, 0xf8];
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif
index 5cfb4375d7..2a6530f7b5 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-binemit.clif
@@ -19,3 +19,33 @@ ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]):
 [-, %xmm6] v2 = x86_psll v0, v1 ; bin: 66 0f f3 f3
     return v2
 }
+
+function %ushr_i16x8(i16x8, i64x2) -> i16x8 {
+ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]):
+[-, %xmm2] v2 = x86_psrl v0, v1 ; bin: 66 0f d1 d1
+    return v2
+}
+
+function %ushr_i32x4(i32x4, i64x2) -> i32x4 {
+ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]):
+[-, %xmm4] v2 = x86_psrl v0, v1 ; bin: 66 0f d2 e0
+    return v2
+}
+
+function %ushr_i64x2(i64x2, i64x2) -> i64x2 {
+ebb0(v0: i64x2 [%xmm6], v1: i64x2 [%xmm3]):
+[-, %xmm6] v2 = x86_psrl v0, v1 ; bin: 66 0f d3 f3
+    return v2
+}
+
+function %sshr_i16x8(i16x8, i64x2) -> i16x8 {
+ebb0(v0: i16x8 [%xmm2], v1: i64x2 [%xmm1]):
+[-, %xmm2] v2 = x86_psra v0, v1 ; bin: 66 0f e1 d1
+    return v2
+}
+
+function %sshr_i32x4(i32x4, i64x2) -> i32x4 {
+ebb0(v0: i32x4 [%xmm4], v1: i64x2 [%xmm0]):
+[-, %xmm4] v2 = x86_psra v0, v1 ; bin: 66 0f e2 e0
+    return v2
+}
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
index 5c2893950d..9c728eb208 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -11,3 +11,23 @@ ebb0:
     ; nextln: v2 = x86_psll v1, v3
     return v2
 }
+
+function %ushr_i64x2() -> i64x2 {
+ebb0:
+    v0 = iconst.i32 1
+    v1 = vconst.i64x2 [1 2]
+    v2 = ushr v1, v0
+    ; check: v3 = bitcast.i64x2 v0
+    ; nextln: v2 = x86_psrl v1, v3
+    return v2
+}
+
+function %sshr_i16x8() -> i16x8 {
+ebb0:
+    v0 = iconst.i32 1
+    v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
+    v2 = sshr v1, v0
+    ; check: v3 = bitcast.i64x2 v0
+    ; nextln: v2 = x86_psra v1, v3
+    return v2
+}
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
index 224b3d5470..07c50bee0a 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
@@ -37,3 +37,71 @@ ebb0:
     return v7
 }
 ; run
+
+function %ushr_i64x2() -> b1 {
+ebb0:
+    v0 = iconst.i32 1
+    v1 = vconst.i64x2 [1 2]
+    v2 = ushr v1, v0
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0
+
+    v5 = extractlane v2, 1
+    v6 = icmp_imm eq v5, 1
+
+    v7 = band v4, v6
+    return v7
+}
+; run
+
+function %ushr_too_large_i32x4() -> b1 {
+ebb0:
+    v0 = iconst.i32 33 ; note that this will shift off the end of each lane
+    v1 = vconst.i32x4 [1 2 4 8]
+    v2 = ushr v1, v0
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0
+
+    v5 = extractlane v2, 3
+    v6 = icmp_imm eq v5, 0
+
+    v7 = band v4, v6
+    return v7
+}
+; run
+
+function %sshr_i16x8() -> b1 {
+ebb0:
+    v0 = iconst.i32 1
+    v1 = vconst.i16x8 [-1 2 4 8 -16 32 64 128]
+    v2 = sshr v1, v0
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0xffff ; because of the shifted-in sign bit, this remains 0xffff == -1
+
+    v5 = extractlane v2, 4
+    v6 = icmp_imm eq v5, 0xfff8 ; -16 has been shifted to -8 == 0xfff8
+
+    v7 = band v4, v6
+    return v7
+}
+; run
+
+function %sshr_too_large_i32x4() -> b1 {
+ebb0:
+    v0 = iconst.i32 33 ; note that this will shift off the end of each lane
+    v1 = vconst.i32x4 [1 2 4 -8]
+    v2 = sshr v1, v0
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0
+
+    v5 = extractlane v2, 3
+    v6 = icmp_imm eq v5, 0xffff_ffff ; shifting in the sign bit repeatedly fills the result with 1s
+
+    v7 = band v4, v6
+    return v7
+}
+; run
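
Reviewer note (not part of the patch): the `_too_large_` run tests above rely on the x86-specific handling of out-of-range shift counts, which differs from a plain modular shift. Below is a minimal Rust sketch of that per-lane reference semantics for 32-bit lanes; the helper names `psrld_lane` and `psrad_lane` are illustrative and do not appear in the Cranelift sources.

```rust
/// PSRL*: vacated bits are filled with zeros; a count >= the lane width
/// clears the lane entirely (the count is not masked, unlike scalar shifts).
fn psrld_lane(lane: u32, count: u64) -> u32 {
    if count >= 32 { 0 } else { lane >> count }
}

/// PSRA*: vacated bits are filled with copies of the sign bit; a count past
/// the lane width behaves like a shift by lane_bits - 1, so the lane
/// saturates to all zeros or all ones depending on its sign.
fn psrad_lane(lane: i32, count: u64) -> i32 {
    lane >> count.min(31)
}

fn main() {
    // Mirrors %ushr_too_large_i32x4: a count of 33 zeroes every lane.
    assert_eq!(psrld_lane(1, 33), 0);
    assert_eq!(psrld_lane(8, 33), 0);
    // Mirrors %sshr_too_large_i32x4: 1 >> 33 -> 0, but -8 >> 33 -> all ones.
    assert_eq!(psrad_lane(1, 33), 0);
    assert_eq!(psrad_lane(-8, 33) as u32, 0xffff_ffff);
    // Mirrors %sshr_i16x8 lane 4 (modeled on i32 lanes here): -16 >> 1 == -8.
    assert_eq!(psrad_lane(-16, 1), -8);
}
```

This also explains why the legalization only bitcasts the scalar shift amount into an XMM register rather than masking it: the PSRL*/PSRA* hardware behavior for oversized counts is exposed directly.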