diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index abd7c7ca90..6bcc2b94f4 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -326,6 +326,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro let fcmp = insts.by_name("fcmp"); let fabs = insts.by_name("fabs"); let fneg = insts.by_name("fneg"); + let iadd_imm = insts.by_name("iadd_imm"); let icmp = insts.by_name("icmp"); let imax = insts.by_name("imax"); let imin = insts.by_name("imin"); @@ -349,6 +350,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro let vall_true = insts.by_name("vall_true"); let vany_true = insts.by_name("vany_true"); + let x86_packss = x86_instructions.by_name("x86_packss"); let x86_pmaxs = x86_instructions.by_name("x86_pmaxs"); let x86_pmaxu = x86_instructions.by_name("x86_pmaxu"); let x86_pmins = x86_instructions.by_name("x86_pmins"); @@ -357,6 +359,8 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro let x86_pshufd = x86_instructions.by_name("x86_pshufd"); let x86_psra = x86_instructions.by_name("x86_psra"); let x86_ptest = x86_instructions.by_name("x86_ptest"); + let x86_punpckh = x86_instructions.by_name("x86_punpckh"); + let x86_punpckl = x86_instructions.by_name("x86_punpckl"); let imm = &shared.imm; @@ -373,6 +377,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro // Set up variables and immediates. let uimm8_zero = Literal::constant(&imm.uimm8, 0x00); let uimm8_one = Literal::constant(&imm.uimm8, 0x01); + let uimm8_eight = Literal::constant(&imm.uimm8, 8); let u128_zeroes = constant(vec![0x00; 16]); let u128_ones = constant(vec![0xff; 16]); let u128_seventies = constant(vec![0x70; 16]); @@ -381,8 +386,12 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro let c = var("c"); let d = var("d"); let e = var("e"); + let f = var("f"); + let g = var("g"); + let h = var("h"); let x = var("x"); let y = var("y"); + let z = var("z"); // Limit the SIMD vector size: eventually multiple vector sizes may be supported // but for now only SSE-sized vectors are available. @@ -484,13 +493,37 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro ); } - // SIMD shift left (arithmetic) + // SIMD shift right (arithmetic) for ty in &[I16, I32, I64] { let sshr = sshr.bind(vector(*ty, sse_vector_size)); - let bitcast = bitcast.bind(vector(I64, sse_vector_size)); + let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size)); narrow.legalize( def!(a = sshr(x, y)), - vec![def!(b = bitcast(y)), def!(a = x86_psra(x, b))], + vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))], + ); + } + { + let sshr = sshr.bind(vector(I8, sse_vector_size)); + let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size)); + let raw_bitcast_i16x8 = raw_bitcast.bind(vector(I16, sse_vector_size)); + let raw_bitcast_i16x8_again = raw_bitcast.bind(vector(I16, sse_vector_size)); + narrow.legalize( + def!(z = sshr(x, y)), + vec![ + // Since we will use the high byte of each 16x8 lane, shift an extra 8 bits. + def!(a = iadd_imm(y, uimm8_eight)), + def!(b = bitcast_i64x2(a)), + // Take the low 8 bytes of x, duplicate them in 16x8 lanes, then shift right. + def!(c = x86_punpckl(x, x)), + def!(d = raw_bitcast_i16x8(c)), + def!(e = x86_psra(d, b)), + // Take the high 8 bytes of x, duplicate them in 16x8 lanes, then shift right. + def!(f = x86_punpckh(x, x)), + def!(g = raw_bitcast_i16x8_again(f)), + def!(h = x86_psra(g, b)), + // Re-pack the vector. + def!(z = x86_packss(e, h)), + ], ); } diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif index 279b89004b..0b14984ed6 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif @@ -18,6 +18,26 @@ block0: return v2 } +function %sshr_i8x16() -> i8x16 { +block0: + v0 = iconst.i32 1 + v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + v2 = sshr v1, v0 + ; check: v3 = iadd_imm v0, 8 + ; nextln: v4 = bitcast.i64x2 v3 + + ; nextln: v5 = x86_punpckl v1, v1 + ; nextln: v6 = raw_bitcast.i16x8 v5 + ; nextln: v7 = x86_psra v6, v4 + + ; nextln: v8 = x86_punpckh v1, v1 + ; nextln: v9 = raw_bitcast.i16x8 v8 + ; nextln: v10 = x86_psra v9, v4 + + ; nextln: v2 = x86_packss v7, v10 + return v2 +} + function %ishl_i8x16() -> i8x16 { block0: v0 = iconst.i32 1 diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif index b321920409..cceb63cddf 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif @@ -51,6 +51,19 @@ block0: } ; run +function %sshr_i8x16() -> b1 { +block0: + v0 = iconst.i32 1 + v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1] + v2 = sshr v1, v0 + + v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8] + v4 = icmp eq v2, v3 + v5 = vall_true v4 + return v5 +} +; run + function %ishl_i8x16() -> b1 { block0: v0 = iconst.i32 1