Add x86 SIMD floating-point arithmetic
This commit is contained in:
@@ -417,6 +417,8 @@ pub(crate) fn define(
|
|||||||
let fill = shared.by_name("fill");
|
let fill = shared.by_name("fill");
|
||||||
let fill_nop = shared.by_name("fill_nop");
|
let fill_nop = shared.by_name("fill_nop");
|
||||||
let floor = shared.by_name("floor");
|
let floor = shared.by_name("floor");
|
||||||
|
let fmax = shared.by_name("fmax");
|
||||||
|
let fmin = shared.by_name("fmin");
|
||||||
let fmul = shared.by_name("fmul");
|
let fmul = shared.by_name("fmul");
|
||||||
let fpromote = shared.by_name("fpromote");
|
let fpromote = shared.by_name("fpromote");
|
||||||
let fsub = shared.by_name("fsub");
|
let fsub = shared.by_name("fsub");
|
||||||
@@ -2081,6 +2083,29 @@ pub(crate) fn define(
|
|||||||
rec_pfcmp.opcodes(&CMPPD),
|
rec_pfcmp.opcodes(&CMPPD),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// SIMD float arithmetic
|
||||||
|
for (ty, inst, opcodes) in &[
|
||||||
|
(F32, fadd, &ADDPS[..]),
|
||||||
|
(F64, fadd, &ADDPD[..]),
|
||||||
|
(F32, fsub, &SUBPS[..]),
|
||||||
|
(F64, fsub, &SUBPD[..]),
|
||||||
|
(F32, fmul, &MULPS[..]),
|
||||||
|
(F64, fmul, &MULPD[..]),
|
||||||
|
(F32, fdiv, &DIVPS[..]),
|
||||||
|
(F64, fdiv, &DIVPD[..]),
|
||||||
|
(F32, fmin, &MINPS[..]),
|
||||||
|
(F64, fmin, &MINPD[..]),
|
||||||
|
(F32, fmax, &MAXPS[..]),
|
||||||
|
(F64, fmax, &MAXPD[..]),
|
||||||
|
] {
|
||||||
|
let inst_ = inst.bind(vector(*ty, sse_vector_size));
|
||||||
|
e.enc_both(inst_, rec_fa.opcodes(opcodes));
|
||||||
|
}
|
||||||
|
for (ty, inst, opcodes) in &[(F32, sqrt, &SQRTPS[..]), (F64, sqrt, &SQRTPD[..])] {
|
||||||
|
let inst_ = inst.bind(vector(*ty, sse_vector_size));
|
||||||
|
e.enc_both(inst_, rec_furm.opcodes(opcodes));
|
||||||
|
}
|
||||||
|
|
||||||
// Reference type instructions
|
// Reference type instructions
|
||||||
|
|
||||||
// Null references implemented as iconst 0.
|
// Null references implemented as iconst 0.
|
||||||
|
|||||||
@@ -15,6 +15,14 @@ pub static ADD_IMM: [u8; 1] = [0x81];
|
|||||||
/// Add sign-extended imm8 to r/m{16,32,64}.
|
/// Add sign-extended imm8 to r/m{16,32,64}.
|
||||||
pub static ADD_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
|
pub static ADD_IMM8_SIGN_EXTEND: [u8; 1] = [0x83];
|
||||||
|
|
||||||
|
/// Add packed double-precision floating-point values from xmm2/mem to xmm1 and store result in
|
||||||
|
/// xmm1 (SSE2).
|
||||||
|
pub static ADDPD: [u8; 3] = [0x66, 0x0f, 0x58];
|
||||||
|
|
||||||
|
/// Add packed single-precision floating-point values from xmm2/mem to xmm1 and store result in
|
||||||
|
/// xmm1 (SSE).
|
||||||
|
pub static ADDPS: [u8; 2] = [0x0f, 0x58];
|
||||||
|
|
||||||
/// Add the low double-precision floating-point value from xmm2/mem to xmm1
|
/// Add the low double-precision floating-point value from xmm2/mem to xmm1
|
||||||
/// and store the result in xmm1.
|
/// and store the result in xmm1.
|
||||||
pub static ADDSD: [u8; 3] = [0xf2, 0x0f, 0x58];
|
pub static ADDSD: [u8; 3] = [0xf2, 0x0f, 0x58];
|
||||||
@@ -93,6 +101,14 @@ pub static CVTTSS2SI: [u8; 3] = [0xf3, 0x0f, 0x2c];
|
|||||||
/// Unsigned divide for {16,32,64}-bit.
|
/// Unsigned divide for {16,32,64}-bit.
|
||||||
pub static DIV: [u8; 1] = [0xf7];
|
pub static DIV: [u8; 1] = [0xf7];
|
||||||
|
|
||||||
|
/// Divide packed double-precision floating-point values in xmm1 by packed double-precision
|
||||||
|
/// floating-point values in xmm2/mem (SSE2).
|
||||||
|
pub static DIVPD: [u8; 3] = [0x66, 0x0f, 0x5e];
|
||||||
|
|
||||||
|
/// Divide packed single-precision floating-point values in xmm1 by packed single-precision
|
||||||
|
/// floating-point values in xmm2/mem (SSE).
|
||||||
|
pub static DIVPS: [u8; 2] = [0x0f, 0x5e];
|
||||||
|
|
||||||
/// Divide low double-precision floating-point value in xmm1 by low double-precision
|
/// Divide low double-precision floating-point value in xmm1 by low double-precision
|
||||||
/// floating-point value in xmm2/m64.
|
/// floating-point value in xmm2/m64.
|
||||||
pub static DIVSD: [u8; 3] = [0xf2, 0x0f, 0x5e];
|
pub static DIVSD: [u8; 3] = [0xf2, 0x0f, 0x5e];
|
||||||
@@ -142,6 +158,14 @@ pub static LEA: [u8; 1] = [0x8d];
|
|||||||
/// Count the number of leading zero bits.
|
/// Count the number of leading zero bits.
|
||||||
pub static LZCNT: [u8; 3] = [0xf3, 0x0f, 0xbd];
|
pub static LZCNT: [u8; 3] = [0xf3, 0x0f, 0xbd];
|
||||||
|
|
||||||
|
/// Return the maximum packed double-precision floating-point values between xmm1 and xmm2/m128
|
||||||
|
/// (SSE2).
|
||||||
|
pub static MAXPD: [u8; 3] = [0x66, 0x0f, 0x5f];
|
||||||
|
|
||||||
|
/// Return the maximum packed single-precision floating-point values between xmm1 and xmm2/m128
|
||||||
|
/// (SSE).
|
||||||
|
pub static MAXPS: [u8; 2] = [0x0f, 0x5f];
|
||||||
|
|
||||||
/// Return the maximum scalar double-precision floating-point value between
|
/// Return the maximum scalar double-precision floating-point value between
|
||||||
/// xmm2/m64 and xmm1.
|
/// xmm2/m64 and xmm1.
|
||||||
pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f];
|
pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f];
|
||||||
@@ -150,6 +174,14 @@ pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f];
|
|||||||
/// xmm2/m32 and xmm1.
|
/// xmm2/m32 and xmm1.
|
||||||
pub static MAXSS: [u8; 3] = [0xf3, 0x0f, 0x5f];
|
pub static MAXSS: [u8; 3] = [0xf3, 0x0f, 0x5f];
|
||||||
|
|
||||||
|
/// Return the minimum packed double-precision floating-point values between xmm1 and xmm2/m128
|
||||||
|
/// (SSE2).
|
||||||
|
pub static MINPD: [u8; 3] = [0x66, 0x0f, 0x5d];
|
||||||
|
|
||||||
|
/// Return the minimum packed single-precision floating-point values between xmm1 and xmm2/m128
|
||||||
|
/// (SSE).
|
||||||
|
pub static MINPS: [u8; 2] = [0x0f, 0x5d];
|
||||||
|
|
||||||
/// Return the minimum scalar double-precision floating-point value between
|
/// Return the minimum scalar double-precision floating-point value between
|
||||||
/// xmm2/m64 and xmm1.
|
/// xmm2/m64 and xmm1.
|
||||||
pub static MINSD: [u8; 3] = [0xf2, 0x0f, 0x5d];
|
pub static MINSD: [u8; 3] = [0xf2, 0x0f, 0x5d];
|
||||||
@@ -224,6 +256,14 @@ pub static MOVZX_WORD: [u8; 2] = [0x0f, 0xb7];
|
|||||||
/// Unsigned multiply for {16,32,64}-bit.
|
/// Unsigned multiply for {16,32,64}-bit.
|
||||||
pub static MUL: [u8; 1] = [0xf7];
|
pub static MUL: [u8; 1] = [0xf7];
|
||||||
|
|
||||||
|
/// Multiply packed double-precision floating-point values in xmm2/mem with xmm1 and store result
|
||||||
|
/// in xmm1 (SSE2).
|
||||||
|
pub static MULPD: [u8; 3] = [0x66, 0x0f, 0x59];
|
||||||
|
|
||||||
|
/// Multiply packed single-precision floating-point values in xmm2/mem with xmm1 and store result
|
||||||
|
/// in xmm1 (SSE).
|
||||||
|
pub static MULPS: [u8; 2] = [0x0f, 0x59];
|
||||||
|
|
||||||
/// Multiply the low double-precision floating-point value in xmm2/m64 by the
|
/// Multiply the low double-precision floating-point value in xmm2/m64 by the
|
||||||
/// low double-precision floating-point value in xmm1.
|
/// low double-precision floating-point value in xmm1.
|
||||||
pub static MULSD: [u8; 3] = [0xf2, 0x0f, 0x59];
|
pub static MULSD: [u8; 3] = [0xf2, 0x0f, 0x59];
|
||||||
@@ -474,6 +514,14 @@ pub static SBB: [u8; 1] = [0x19];
|
|||||||
/// Set byte if overflow (OF=1).
|
/// Set byte if overflow (OF=1).
|
||||||
pub static SET_BYTE_IF_OVERFLOW: [u8; 2] = [0x0f, 0x90];
|
pub static SET_BYTE_IF_OVERFLOW: [u8; 2] = [0x0f, 0x90];
|
||||||
|
|
||||||
|
/// Compute the square root of the packed double-precision floating-point values and store the
|
||||||
|
/// result in xmm1 (SSE2).
|
||||||
|
pub static SQRTPD: [u8; 3] = [0x66, 0x0f, 0x51];
|
||||||
|
|
||||||
|
/// Compute the square root of the packed single-precision floating-point values and store the
|
||||||
|
/// result in xmm1 (SSE).
|
||||||
|
pub static SQRTPS: [u8; 2] = [0x0f, 0x51];
|
||||||
|
|
||||||
/// Compute square root of scalar double-precision floating-point value.
|
/// Compute square root of scalar double-precision floating-point value.
|
||||||
pub static SQRTSD: [u8; 3] = [0xf2, 0x0f, 0x51];
|
pub static SQRTSD: [u8; 3] = [0xf2, 0x0f, 0x51];
|
||||||
|
|
||||||
@@ -483,6 +531,14 @@ pub static SQRTSS: [u8; 3] = [0xf3, 0x0f, 0x51];
|
|||||||
/// Subtract r{16,32,64} from r/m of same size.
|
/// Subtract r{16,32,64} from r/m of same size.
|
||||||
pub static SUB: [u8; 1] = [0x29];
|
pub static SUB: [u8; 1] = [0x29];
|
||||||
|
|
||||||
|
/// Subtract packed double-precision floating-point values in xmm2/mem from xmm1 and store result
|
||||||
|
/// in xmm1 (SSE2).
|
||||||
|
pub static SUBPD: [u8; 3] = [0x66, 0x0f, 0x5c];
|
||||||
|
|
||||||
|
/// Subtract packed single-precision floating-point values in xmm2/mem from xmm1 and store result
|
||||||
|
/// in xmm1 (SSE).
|
||||||
|
pub static SUBPS: [u8; 2] = [0x0f, 0x5c];
|
||||||
|
|
||||||
/// Subtract the low double-precision floating-point value in xmm2/m64 from xmm1
|
/// Subtract the low double-precision floating-point value in xmm2/m64 from xmm1
|
||||||
/// and store the result in xmm1.
|
/// and store the result in xmm1.
|
||||||
pub static SUBSD: [u8; 3] = [0xf2, 0x0f, 0x5c];
|
pub static SUBSD: [u8; 3] = [0xf2, 0x0f, 0x5c];
|
||||||
|
|||||||
@@ -173,3 +173,27 @@ ebb0:
|
|||||||
[-, %xmm3] v3 = usub_sat v0, v1 ; bin: 66 0f d9 dd
|
[-, %xmm3] v3 = usub_sat v0, v1 ; bin: 66 0f d9 dd
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function %float_arithmetic_f32x4(f32x4, f32x4) {
|
||||||
|
ebb0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm5]):
|
||||||
|
[-, %xmm3] v2 = fadd v0, v1 ; bin: 40 0f 58 dd
|
||||||
|
[-, %xmm3] v3 = fsub v0, v1 ; bin: 40 0f 5c dd
|
||||||
|
[-, %xmm3] v4 = fmul v0, v1 ; bin: 40 0f 59 dd
|
||||||
|
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 40 0f 5e dd
|
||||||
|
[-, %xmm3] v6 = fmin v0, v1 ; bin: 40 0f 5d dd
|
||||||
|
[-, %xmm3] v7 = fmax v0, v1 ; bin: 40 0f 5f dd
|
||||||
|
[-, %xmm3] v8 = sqrt v0 ; bin: 40 0f 51 db
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
function %float_arithmetic_f64x2(f64x2, f64x2) {
|
||||||
|
ebb0(v0: f64x2 [%xmm3], v1: f64x2 [%xmm5]):
|
||||||
|
[-, %xmm3] v2 = fadd v0, v1 ; bin: 66 40 0f 58 dd
|
||||||
|
[-, %xmm3] v3 = fsub v0, v1 ; bin: 66 40 0f 5c dd
|
||||||
|
[-, %xmm3] v4 = fmul v0, v1 ; bin: 66 40 0f 59 dd
|
||||||
|
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 40 0f 5e dd
|
||||||
|
[-, %xmm3] v6 = fmin v0, v1 ; bin: 66 40 0f 5d dd
|
||||||
|
[-, %xmm3] v7 = fmax v0, v1 ; bin: 66 40 0f 5f dd
|
||||||
|
[-, %xmm3] v8 = sqrt v0 ; bin: 66 40 0f 51 db
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|||||||
@@ -153,3 +153,77 @@ ebb0:
|
|||||||
return v8
|
return v8
|
||||||
}
|
}
|
||||||
; run
|
; run
|
||||||
|
|
||||||
|
function %add_sub_f32x4() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = vconst.f32x4 [0x4.2 0.0 0.0 0.0]
|
||||||
|
v1 = vconst.f32x4 [0x1.0 0x1.0 0x1.0 0x1.0]
|
||||||
|
v2 = vconst.f32x4 [0x5.2 0x1.0 0x1.0 0x1.0]
|
||||||
|
|
||||||
|
v3 = fadd v0, v1
|
||||||
|
v4 = fcmp eq v3, v2
|
||||||
|
|
||||||
|
v6 = fsub v2, v1
|
||||||
|
v7 = fcmp eq v6, v0
|
||||||
|
|
||||||
|
v8 = band v4, v7
|
||||||
|
v9 = vall_true v8
|
||||||
|
return v9
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
|
function %mul_div_f32x4() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = vconst.f32x4 [0x4.2 -0x2.1 0x2.0 0.0]
|
||||||
|
v1 = vconst.f32x4 [0x3.4 0x6.7 0x8.9 0xa.b]
|
||||||
|
v2 = vconst.f32x4 [0xd.68 -0xd.47 0x11.2 0x0.0]
|
||||||
|
|
||||||
|
v3 = fmul v0, v1
|
||||||
|
v4 = fcmp eq v3, v2
|
||||||
|
|
||||||
|
v6 = fdiv v2, v1
|
||||||
|
v7 = fcmp eq v6, v0
|
||||||
|
|
||||||
|
v8 = band v4, v7
|
||||||
|
v9 = vall_true v8
|
||||||
|
return v9
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
|
function %sqrt_f64x2() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = vconst.f64x2 [0x9.0 0x1.0]
|
||||||
|
v1 = sqrt v0
|
||||||
|
v2 = vconst.f64x2 [0x3.0 0x1.0]
|
||||||
|
v3 = fcmp eq v2, v1
|
||||||
|
v4 = vall_true v3
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
|
function %fmax_f64x2() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = vconst.f64x2 [-0.0 -0x1.0]
|
||||||
|
v1 = vconst.f64x2 [+0.0 +0x1.0]
|
||||||
|
|
||||||
|
v2 = fmax v0, v1
|
||||||
|
v3 = fcmp eq v2, v1
|
||||||
|
v4 = vall_true v3
|
||||||
|
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|
||||||
|
|
||||||
|
function %fmin_f64x2() -> b1 {
|
||||||
|
ebb0:
|
||||||
|
v0 = vconst.f64x2 [-0x1.0 -0x1.0]
|
||||||
|
v1 = vconst.f64x2 [+0.0 +0x1.0]
|
||||||
|
|
||||||
|
v2 = fmin v0, v1
|
||||||
|
v3 = fcmp eq v2, v0
|
||||||
|
v4 = vall_true v3
|
||||||
|
|
||||||
|
return v4
|
||||||
|
}
|
||||||
|
; run
|
||||||
|
|||||||
Reference in New Issue
Block a user