diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 8de772c175..081ecde2b7 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1600,6 +1600,9 @@ fn define_simd( let regspill = shared.by_name("regspill"); let sadd_sat = shared.by_name("sadd_sat"); let scalar_to_vector = shared.by_name("scalar_to_vector"); + let sload8x8 = shared.by_name("sload8x8"); + let sload16x4 = shared.by_name("sload16x4"); + let sload32x2 = shared.by_name("sload32x2"); let spill = shared.by_name("spill"); let sqrt = shared.by_name("sqrt"); let sshr_imm = shared.by_name("sshr_imm"); @@ -1607,6 +1610,9 @@ fn define_simd( let store = shared.by_name("store"); let store_complex = shared.by_name("store_complex"); let uadd_sat = shared.by_name("uadd_sat"); + let uload8x8 = shared.by_name("uload8x8"); + let uload16x4 = shared.by_name("uload16x4"); + let uload32x2 = shared.by_name("uload32x2"); let ushr_imm = shared.by_name("ushr_imm"); let usub_sat = shared.by_name("usub_sat"); let vconst = shared.by_name("vconst"); @@ -1926,6 +1932,31 @@ fn define_simd( e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0); } + // SIMD load extend + for (inst, opcodes) in &[ + (uload8x8, &PMOVZXBW), + (uload16x4, &PMOVZXWD), + (uload32x2, &PMOVZXDQ), + (sload8x8, &PMOVSXBW), + (sload16x4, &PMOVSXWD), + (sload32x2, &PMOVSXDQ), + ] { + let isap = Some(use_sse41_simd); + for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] { + let inst = *inst; + let template = recipe.opcodes(*opcodes); + e.enc32_maybe_isap(inst.clone().bind(I32), template.clone(), isap); + // REX-less encoding must come after REX encoding so we don't use it by + // default. Otherwise reg-alloc would never use r8 and up. 
+ e.enc64_maybe_isap(inst.clone().bind(I32), template.clone().rex(), isap); + e.enc64_maybe_isap(inst.clone().bind(I32), template.clone(), isap); + // Similar to above; TODO some of this duplication can be cleaned up by infer_rex() + // tracked in https://github.com/bytecodealliance/cranelift/issues/1090 + e.enc64_maybe_isap(inst.clone().bind(I64), template.clone().rex(), isap); + e.enc64_maybe_isap(inst.bind(I64), template, isap); + } + } + // SIMD integer addition for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] { let iadd = iadd.bind(vector(*ty, sse_vector_size)); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index 1a895fe2ec..bbfd05a5d8 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -417,6 +417,30 @@ pub static PMINUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3b]; /// xmm1 (SSE4.1). pub static PMINUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3a]; +/// Sign extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20]; + +/// Sign extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23]; + +/// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25]; + +/// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30]; + +/// Zero extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit +/// integers in xmm1 (SSE4.1). 
+pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33]; + +/// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit +/// integers in xmm1 (SSE4.1). +pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35]; + /// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of /// the results in xmm1 (SSE2). pub static PMULLW: [u8; 3] = [0x66, 0x0f, 0xd5]; diff --git a/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif index 92d83867d7..b164aac343 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif @@ -36,3 +36,33 @@ block0: return } + +function %uload_extend() { +block0: + [-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef + [-,%xmm2] v3 = uload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 30 12 + [-,%xmm2] v4 = uload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 30 52 14 + [-,%xmm2] v5 = uload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 30 92 00000100 + [-,%xmm2] v6 = uload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 33 12 + [-,%xmm2] v7 = uload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 33 52 14 + [-,%xmm2] v8 = uload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 33 92 00000100 + [-,%xmm2] v9 = uload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 35 12 + [-,%xmm2] v10 = uload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 35 52 14 + [-,%xmm2] v11 = uload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 35 92 00000100 + return +} + +function %sload_extend() { +block0: + [-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef + [-,%xmm2] v3 = sload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 20 12 + [-,%xmm2] v4 = sload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 20 52 14 + [-,%xmm2] v5 = sload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 20 92 00000100 + [-,%xmm2] v6 = sload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 23 12 + [-,%xmm2] v7 = sload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 23 52 14 + [-,%xmm2] v8 = sload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 
23 92 00000100 + [-,%xmm2] v9 = sload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 25 12 + [-,%xmm2] v10 = sload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 25 52 14 + [-,%xmm2] v11 = sload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 25 92 00000100 + return +}