Add x86 implementation of SIMD load_extend instructions
@@ -1600,6 +1600,9 @@ fn define_simd(
     let regspill = shared.by_name("regspill");
     let sadd_sat = shared.by_name("sadd_sat");
     let scalar_to_vector = shared.by_name("scalar_to_vector");
+    let sload8x8 = shared.by_name("sload8x8");
+    let sload16x4 = shared.by_name("sload16x4");
+    let sload32x2 = shared.by_name("sload32x2");
     let spill = shared.by_name("spill");
     let sqrt = shared.by_name("sqrt");
     let sshr_imm = shared.by_name("sshr_imm");
@@ -1607,6 +1610,9 @@ fn define_simd(
     let store = shared.by_name("store");
     let store_complex = shared.by_name("store_complex");
     let uadd_sat = shared.by_name("uadd_sat");
+    let uload8x8 = shared.by_name("uload8x8");
+    let uload16x4 = shared.by_name("uload16x4");
+    let uload32x2 = shared.by_name("uload32x2");
     let ushr_imm = shared.by_name("ushr_imm");
     let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
@@ -1926,6 +1932,31 @@ fn define_simd(
         e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0);
     }
 
+    // SIMD load extend
+    for (inst, opcodes) in &[
+        (uload8x8, &PMOVZXBW),
+        (uload16x4, &PMOVZXWD),
+        (uload32x2, &PMOVZXDQ),
+        (sload8x8, &PMOVSXBW),
+        (sload16x4, &PMOVSXWD),
+        (sload32x2, &PMOVSXDQ),
+    ] {
+        let isap = Some(use_sse41_simd);
+        for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] {
+            let inst = *inst;
+            let template = recipe.opcodes(*opcodes);
+            e.enc32_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
+            // REX-less encoding must come after REX encoding so we don't use it by
+            // default. Otherwise reg-alloc would never use r8 and up.
+            e.enc64_maybe_isap(inst.clone().bind(I32), template.clone().rex(), isap);
+            e.enc64_maybe_isap(inst.clone().bind(I32), template.clone(), isap);
+            // Similar to above; TODO some of this duplication can be cleaned up by infer_rex(),
+            // tracked in https://github.com/bytecodealliance/cranelift/issues/1090
+            e.enc64_maybe_isap(inst.clone().bind(I64), template.clone().rex(), isap);
+            e.enc64_maybe_isap(inst.bind(I64), template, isap);
+        }
+    }
+
     // SIMD integer addition
     for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] {
         let iadd = iadd.bind(vector(*ty, sse_vector_size));
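The ordering comment in this hunk is the subtle part: the REX-prefixed template is registered first so it is the default on x86-64, which is what lets register allocation pick the extended registers. As a minimal sketch (not Cranelift's actual emitter; `rex` is a hypothetical helper), the REX byte has the layout 0b0100WRXB, and the extension bits are the only way to name r8..r15 or xmm8..xmm15 in a ModRM operand:

/// Hypothetical sketch of the REX prefix byte, 0b0100WRXB.
fn rex(w: bool, reg: u8, index: u8, base: u8) -> u8 {
    0x40 | ((w as u8) << 3)          // REX.W: 64-bit operand size
        | (((reg >> 3) & 1) << 2)    // REX.R: extends ModRM.reg (the xmm destination)
        | (((index >> 3) & 1) << 1)  // REX.X: extends SIB.index
        | ((base >> 3) & 1)          // REX.B: extends ModRM.rm (the base address register)
}

fn main() {
    // dst=xmm2, base=rdx needs no extension bits: a plain 0x40, matching the
    // `66 40 0f 38 30 12` bytes in the filetests below.
    assert_eq!(rex(false, 2, 0, 2), 0x40);
    // A base of r10 sets REX.B; if the shorter REX-less encoding were listed
    // first, reg-alloc could never hand this instruction r8 and up.
    assert_eq!(rex(false, 2, 0, 10), 0x41);
}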
@@ -417,6 +417,30 @@ pub static PMINUD: [u8; 4] = [0x66, 0x0f, 0x38, 0x3b];
 /// xmm1 (SSE4.1).
 pub static PMINUW: [u8; 4] = [0x66, 0x0f, 0x38, 0x3a];
 
+/// Sign extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20];
+
+/// Sign extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23];
+
+/// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25];
+
+/// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30];
+
+/// Zero extend 4 packed 16-bit integers in the low 8 bytes of xmm2/m64 to 4 packed 32-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33];
+
+/// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
+/// integers in xmm1 (SSE4.1).
+pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35];
+
 /// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of
 /// the results in xmm1 (SSE2).
 pub static PMULLW: [u8; 3] = [0x66, 0x0f, 0xd5];
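The doc comments spell out the semantics these opcodes get from the hardware. As a scalar sketch of the byte-to-word pair (illustrative only; Cranelift emits the real SSE4.1 instructions, and the function names here are hypothetical):

/// uload8x8 / PMOVZXBW: zero extend each of 8 byte lanes to 16 bits.
fn pmovzxbw(src: [u8; 8]) -> [u16; 8] {
    src.map(u16::from)
}

/// sload8x8 / PMOVSXBW: reinterpret each lane as signed, then sign extend.
fn pmovsxbw(src: [u8; 8]) -> [i16; 8] {
    src.map(|b| b as i8 as i16)
}

fn main() {
    assert_eq!(pmovzxbw([0xff; 8])[0], 0x00ff); // 255 zero-extends to 255
    assert_eq!(pmovsxbw([0xff; 8])[0], -1);     // 0xff sign-extends to -1
}

The WD and DQ variants are the same idea at 16-to-32 and 32-to-64 bit lane widths.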
@@ -36,3 +36,33 @@ block0:
     return
 }
 
+function %uload_extend() {
+block0:
+    [-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef
+    [-,%xmm2] v3 = uload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 30 12
+    [-,%xmm2] v4 = uload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 30 52 14
+    [-,%xmm2] v5 = uload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 30 92 00000100
+    [-,%xmm2] v6 = uload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 33 12
+    [-,%xmm2] v7 = uload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 33 52 14
+    [-,%xmm2] v8 = uload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 33 92 00000100
+    [-,%xmm2] v9 = uload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 35 12
+    [-,%xmm2] v10 = uload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 35 52 14
+    [-,%xmm2] v11 = uload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 35 92 00000100
+    return
+}
+
+function %sload_extend() {
+block0:
+    [-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef
+    [-,%xmm2] v3 = sload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 20 12
+    [-,%xmm2] v4 = sload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 20 52 14
+    [-,%xmm2] v5 = sload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 20 92 00000100
+    [-,%xmm2] v6 = sload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 23 12
+    [-,%xmm2] v7 = sload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 23 52 14
+    [-,%xmm2] v8 = sload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 23 92 00000100
+    [-,%xmm2] v9 = sload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 25 12
+    [-,%xmm2] v10 = sload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 25 52 14
+    [-,%xmm2] v11 = sload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 25 92 00000100
+    return
+}
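The three encodings exercised per instruction correspond to the rec_fld, rec_fldDisp8, and rec_fldDisp32 recipes, i.e. ModRM mod values 00, 01, and 10. A simplified sketch of that choice (a hypothetical helper; it ignores the RBP/R13 and SIB special cases, and the filetest prints the 32-bit displacement as a single token):

/// Pick ModRM mode and displacement bytes by offset size.
fn modrm_and_disp(xmm_dst: u8, base: u8, offset: i32) -> Vec<u8> {
    let modrm = |mode: u8| (mode << 6) | ((xmm_dst & 7) << 3) | (base & 7);
    match offset {
        0 => vec![modrm(0b00)],                        // no displacement (rec_fld)
        -128..=127 => vec![modrm(0b01), offset as u8], // 8-bit displacement (rec_fldDisp8)
        _ => {
            let mut bytes = vec![modrm(0b10)];         // 32-bit displacement (rec_fldDisp32)
            bytes.extend_from_slice(&offset.to_le_bytes());
            bytes
        }
    }
}

fn main() {
    // Tail bytes of the uload8x8 tests above: `12`, `52 14`, `92 ...`.
    assert_eq!(modrm_and_disp(2, 2, 0), vec![0x12]);
    assert_eq!(modrm_and_disp(2, 2, 20), vec![0x52, 0x14]);
    assert_eq!(modrm_and_disp(2, 2, 256)[0], 0x92);
}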