diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index e2f6246ed6..bc0e26b87e 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -417,6 +417,8 @@ pub(crate) fn define( let fill = shared.by_name("fill"); let fill_nop = shared.by_name("fill_nop"); let floor = shared.by_name("floor"); + let fmax = shared.by_name("fmax"); + let fmin = shared.by_name("fmin"); let fmul = shared.by_name("fmul"); let fpromote = shared.by_name("fpromote"); let fsub = shared.by_name("fsub"); @@ -2081,6 +2083,29 @@ pub(crate) fn define( rec_pfcmp.opcodes(&CMPPD), ); + // SIMD float arithmetic + for (ty, inst, opcodes) in &[ + (F32, fadd, &ADDPS[..]), + (F64, fadd, &ADDPD[..]), + (F32, fsub, &SUBPS[..]), + (F64, fsub, &SUBPD[..]), + (F32, fmul, &MULPS[..]), + (F64, fmul, &MULPD[..]), + (F32, fdiv, &DIVPS[..]), + (F64, fdiv, &DIVPD[..]), + (F32, fmin, &MINPS[..]), + (F64, fmin, &MINPD[..]), + (F32, fmax, &MAXPS[..]), + (F64, fmax, &MAXPD[..]), + ] { + let inst_ = inst.bind(vector(*ty, sse_vector_size)); + e.enc_both(inst_, rec_fa.opcodes(opcodes)); + } + for (ty, inst, opcodes) in &[(F32, sqrt, &SQRTPS[..]), (F64, sqrt, &SQRTPD[..])] { + let inst_ = inst.bind(vector(*ty, sse_vector_size)); + e.enc_both(inst_, rec_furm.opcodes(opcodes)); + } + // Reference type instructions // Null references implemented as iconst 0. diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index fde15899e7..a0d9c8d9c6 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -15,6 +15,14 @@ pub static ADD_IMM: [u8; 1] = [0x81]; /// Add sign-extended imm8 to r/m{16,32,64}. pub static ADD_IMM8_SIGN_EXTEND: [u8; 1] = [0x83]; +/// Add packed double-precision floating-point values from xmm2/mem to xmm1 and store result in +/// xmm1 (SSE2). 
+pub static ADDPD: [u8; 3] = [0x66, 0x0f, 0x58]; + +/// Add packed single-precision floating-point values from xmm2/mem to xmm1 and store result in +/// xmm1 (SSE). +pub static ADDPS: [u8; 2] = [0x0f, 0x58]; + /// Add the low double-precision floating-point value from xmm2/mem to xmm1 /// and store the result in xmm1. pub static ADDSD: [u8; 3] = [0xf2, 0x0f, 0x58]; @@ -93,6 +101,14 @@ pub static CVTTSS2SI: [u8; 3] = [0xf3, 0x0f, 0x2c]; /// Unsigned divide for {16,32,64}-bit. pub static DIV: [u8; 1] = [0xf7]; +/// Divide packed double-precision floating-point values in xmm1 by packed double-precision +/// floating-point values in xmm2/mem (SSE2). +pub static DIVPD: [u8; 3] = [0x66, 0x0f, 0x5e]; + +/// Divide packed single-precision floating-point values in xmm1 by packed single-precision +/// floating-point values in xmm2/mem (SSE). +pub static DIVPS: [u8; 2] = [0x0f, 0x5e]; + /// Divide low double-precision floating-point value in xmm1 by low double-precision /// floating-point value in xmm2/m64. pub static DIVSD: [u8; 3] = [0xf2, 0x0f, 0x5e]; @@ -142,6 +158,14 @@ pub static LEA: [u8; 1] = [0x8d]; /// Count the number of leading zero bits. pub static LZCNT: [u8; 3] = [0xf3, 0x0f, 0xbd]; +/// Return the maximum packed double-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE2). +pub static MAXPD: [u8; 3] = [0x66, 0x0f, 0x5f]; + +/// Return the maximum packed single-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE). +pub static MAXPS: [u8; 2] = [0x0f, 0x5f]; + /// Return the maximum scalar double-precision floating-point value between /// xmm2/m64 and xmm1. pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f]; @@ -150,6 +174,14 @@ pub static MAXSD: [u8; 3] = [0xf2, 0x0f, 0x5f]; /// xmm2/m32 and xmm1. pub static MAXSS: [u8; 3] = [0xf3, 0x0f, 0x5f]; +/// Return the minimum packed double-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE2). 
+pub static MINPD: [u8; 3] = [0x66, 0x0f, 0x5d]; + +/// Return the minimum packed single-precision floating-point values between xmm1 and xmm2/m128 +/// (SSE). +pub static MINPS: [u8; 2] = [0x0f, 0x5d]; + /// Return the minimum scalar double-precision floating-point value between /// xmm2/m64 and xmm1. pub static MINSD: [u8; 3] = [0xf2, 0x0f, 0x5d]; @@ -224,6 +256,14 @@ pub static MOVZX_WORD: [u8; 2] = [0x0f, 0xb7]; /// Unsigned multiply for {16,32,64}-bit. pub static MUL: [u8; 1] = [0xf7]; +/// Multiply packed double-precision floating-point values from xmm2/mem to xmm1 and store result +/// in xmm1 (SSE2). +pub static MULPD: [u8; 3] = [0x66, 0x0f, 0x59]; + +/// Multiply packed single-precision floating-point values from xmm2/mem to xmm1 and store result +/// in xmm1 (SSE). +pub static MULPS: [u8; 2] = [0x0f, 0x59]; + /// Multiply the low double-precision floating-point value in xmm2/m64 by the /// low double-precision floating-point value in xmm1. pub static MULSD: [u8; 3] = [0xf2, 0x0f, 0x59]; @@ -474,6 +514,14 @@ pub static SBB: [u8; 1] = [0x19]; /// Set byte if overflow (OF=1). pub static SET_BYTE_IF_OVERFLOW: [u8; 2] = [0x0f, 0x90]; +/// Compute the square root of the packed double-precision floating-point values and store the +/// result in xmm1 (SSE2). +pub static SQRTPD: [u8; 3] = [0x66, 0x0f, 0x51]; + +/// Compute the square root of the packed single-precision floating-point values and store the +/// result in xmm1 (SSE). +pub static SQRTPS: [u8; 2] = [0x0f, 0x51]; + /// Compute square root of scalar double-precision floating-point value. pub static SQRTSD: [u8; 3] = [0xf2, 0x0f, 0x51]; @@ -483,6 +531,14 @@ pub static SQRTSS: [u8; 3] = [0xf3, 0x0f, 0x51]; /// Subtract r{16,32,64} from r/m of same size. pub static SUB: [u8; 1] = [0x29]; +/// Subtract packed double-precision floating-point values in xmm2/mem from xmm1 and store result +/// in xmm1 (SSE2). 
+pub static SUBPD: [u8; 3] = [0x66, 0x0f, 0x5c]; + +/// Subtract packed single-precision floating-point values in xmm2/mem from xmm1 and store result +/// in xmm1 (SSE). +pub static SUBPS: [u8; 2] = [0x0f, 0x5c]; + /// Subtract the low double-precision floating-point value in xmm2/m64 from xmm1 /// and store the result in xmm1. pub static SUBSD: [u8; 3] = [0xf2, 0x0f, 0x5c]; diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif index cc2d7f03e1..2994d36146 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif @@ -173,3 +173,27 @@ ebb0: [-, %xmm3] v3 = usub_sat v0, v1 ; bin: 66 0f d9 dd return } + +function %float_arithmetic_f32x4(f32x4, f32x4) { +ebb0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm5]): +[-, %xmm3] v2 = fadd v0, v1 ; bin: 40 0f 58 dd +[-, %xmm3] v3 = fsub v0, v1 ; bin: 40 0f 5c dd +[-, %xmm3] v4 = fmul v0, v1 ; bin: 40 0f 59 dd +[-, %xmm3] v5 = fdiv v0, v1 ; bin: 40 0f 5e dd +[-, %xmm3] v6 = fmin v0, v1 ; bin: 40 0f 5d dd +[-, %xmm3] v7 = fmax v0, v1 ; bin: 40 0f 5f dd +[-, %xmm3] v8 = sqrt v0 ; bin: 40 0f 51 db + return +} + +function %float_arithmetic_f64x2(f64x2, f64x2) { +ebb0(v0: f64x2 [%xmm3], v1: f64x2 [%xmm5]): +[-, %xmm3] v2 = fadd v0, v1 ; bin: 66 40 0f 58 dd +[-, %xmm3] v3 = fsub v0, v1 ; bin: 66 40 0f 5c dd +[-, %xmm3] v4 = fmul v0, v1 ; bin: 66 40 0f 59 dd +[-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 40 0f 5e dd +[-, %xmm3] v6 = fmin v0, v1 ; bin: 66 40 0f 5d dd +[-, %xmm3] v7 = fmax v0, v1 ; bin: 66 40 0f 5f dd +[-, %xmm3] v8 = sqrt v0 ; bin: 66 40 0f 51 db + return +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif index 22bcf11bdd..9fa569ac28 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif +++ 
b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif @@ -153,3 +153,77 @@ ebb0: return v8 } ; run + +function %add_sub_f32x4() -> b1 { +ebb0: + v0 = vconst.f32x4 [0x4.2 0.0 0.0 0.0] + v1 = vconst.f32x4 [0x1.0 0x1.0 0x1.0 0x1.0] + v2 = vconst.f32x4 [0x5.2 0x1.0 0x1.0 0x1.0] + + v3 = fadd v0, v1 + v4 = fcmp eq v3, v2 + + v6 = fsub v2, v1 + v7 = fcmp eq v6, v0 + + v8 = band v4, v7 + v9 = vall_true v8 + return v9 +} +; run + +function %mul_div_f32x4() -> b1 { +ebb0: + v0 = vconst.f32x4 [0x4.2 -0x2.1 0x2.0 0.0] + v1 = vconst.f32x4 [0x3.4 0x6.7 0x8.9 0xa.b] + v2 = vconst.f32x4 [0xd.68 -0xd.47 0x11.2 0x0.0] + + v3 = fmul v0, v1 + v4 = fcmp eq v3, v2 + + v6 = fdiv v2, v1 + v7 = fcmp eq v6, v0 + + v8 = band v4, v7 + v9 = vall_true v8 + return v9 +} +; run + +function %sqrt_f64x2() -> b1 { +ebb0: + v0 = vconst.f64x2 [0x9.0 0x1.0] + v1 = sqrt v0 + v2 = vconst.f64x2 [0x3.0 0x1.0] + v3 = fcmp eq v2, v1 + v4 = vall_true v3 + return v4 +} +; run + +function %fmax_f64x2() -> b1 { +ebb0: + v0 = vconst.f64x2 [-0.0 -0x1.0] + v1 = vconst.f64x2 [+0.0 +0x1.0] + + v2 = fmax v0, v1 + v3 = fcmp eq v2, v1 + v4 = vall_true v3 + + return v4 +} +; run + + +function %fmin_f64x2() -> b1 { +ebb0: + v0 = vconst.f64x2 [-0x1.0 -0x1.0] + v1 = vconst.f64x2 [+0.0 +0x1.0] + + v2 = fmin v0, v1 + v3 = fcmp eq v2, v0 + v4 = vall_true v3 + + return v4 +} +; run