Add x86 SIMD implementation of float comparison

2019-10-28 13:26:21 -07:00
parent e5a36e2c61
commit d32301854d
5 changed files with 122 additions and 0 deletions
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -610,6 +610,7 @@ pub(crate) fn define(
    let rec_null_fpr = r.recipe("null_fpr");
    let rec_pcrel_fnaddr8 = r.template("pcrel_fnaddr8");
    let rec_pcrel_gvaddr8 = r.template("pcrel_gvaddr8");
+    let rec_pfcmp = r.template("pfcmp");
    let rec_popq = r.template("popq");
    let rec_pu_id = r.template("pu_id");
    let rec_pu_id_bool = r.template("pu_id_bool");
@@ -2070,6 +2071,16 @@ pub(crate) fn define(
        e.enc_32_64_maybe_isap(inst_, rec_fa.opcodes(opcodes), *isa_predicate);
    }

+    // SIMD float comparisons
+    e.enc_both(
+        fcmp.bind(vector(F32, sse_vector_size)),
+        rec_pfcmp.opcodes(&CMPPS),
+    );
+    e.enc_both(
+        fcmp.bind(vector(F64, sse_vector_size)),
+        rec_pfcmp.opcodes(&CMPPD),
+    );
+
    // Reference type instructions

    // Null references implemented as iconst 0.
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -61,6 +61,14 @@ pub static CMP_IMM8: [u8; 1] = [0x83];
 /// Compare r{16,32,64} with r/m of the same size.
 pub static CMP_REG: [u8; 1] = [0x39];

+/// Compare packed double-precision floating-point values in xmm2/m128 and xmm1 using bits 2:0 of
+/// imm8 as the comparison predicate (SSE2).
+pub static CMPPD: [u8; 3] = [0x66, 0x0f, 0xc2];
+
+/// Compare packed single-precision floating-point values in xmm2/m128 and xmm1 using bits 2:0 of
+/// imm8 as the comparison predicate (SSE).
+pub static CMPPS: [u8; 2] = [0x0f, 0xc2];
+
 /// Convert scalar double-precision floating-point value to scalar single-precision
 /// floating-point value.
 pub static CVTSD2SS: [u8; 3] = [0xf2, 0x0f, 0x5a];
--- a/cranelift/codegen/meta/src/isa/x86/recipes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs
@@ -3015,6 +3015,43 @@ pub(crate) fn define<'shared>(
            ),
    );

+    {
+        let supported_floatccs: Vec<Literal> = ["eq", "lt", "le", "uno", "ne", "uge", "ugt", "ord"]
+            .iter()
+            .map(|name| Literal::enumerator_for(floatcc, name))
+            .collect();
+        recipes.add_template_recipe(
+            EncodingRecipeBuilder::new("pfcmp", &formats.float_compare, 2)
+                .operands_in(vec![fpr, fpr])
+                .operands_out(vec![0])
+                .inst_predicate(supported_floatccs_predicate(
+                    &supported_floatccs[..],
+                    &*formats.float_compare,
+                ))
+                .emit(
+                    r#"
+                    // Emit the prefix/opcode bytes followed by the register-register ModR/M byte.
+                    {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink);
+                    modrm_rr(in_reg1, in_reg0, sink);
+                    // imm8 predicate. SSE lacks ordered gt/ge; 0x05/0x06 are NLT/NLE (true on NaN).
+                    use crate::ir::condcodes::FloatCC::*;
+                    let imm = match cond {
+                        Equal                          => 0x00, // EQ
+                        LessThan                       => 0x01, // LT
+                        LessThanOrEqual                => 0x02, // LE
+                        Unordered                      => 0x03, // UNORD
+                        NotEqual                       => 0x04, // NEQ
+                        UnorderedOrGreaterThanOrEqual  => 0x05, // NLT
+                        UnorderedOrGreaterThan         => 0x06, // NLE
+                        Ordered                        => 0x07, // ORD
+                        _ => panic!("{} not supported by pfcmp", cond),
+                    };
+                    sink.put1(imm);
+                "#,
+                ),
+        );
+    }
+
    recipes.add_template_recipe(
        EncodingRecipeBuilder::new("is_zero", &formats.unary, 2 + 2)
            .operands_in(vec![gpr])
--- a/cranelift/filetests/filetests/isa/x86/simd-comparison-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-comparison-binemit.clif
@@ -52,3 +52,29 @@ ebb0(v0: i32x4 [%xmm2], v1: i32x4 [%xmm4]):
 [-, %xmm2]  v5 = x86_pminu v0, v1     ; bin: 66 0f 38 3b d4
            return
 }
+
+function %fcmp_f32x4(f32x4, f32x4) {
+ebb0(v0: f32x4 [%xmm2], v1: f32x4 [%xmm4]):
+[-, %xmm2]  v2 = fcmp eq v0, v1     ; bin: 40 0f c2 d4 00
+[-, %xmm2]  v3 = fcmp lt v0, v1     ; bin: 40 0f c2 d4 01
+[-, %xmm2]  v4 = fcmp le v0, v1     ; bin: 40 0f c2 d4 02
+[-, %xmm2]  v5 = fcmp uno v0, v1    ; bin: 40 0f c2 d4 03
+[-, %xmm2]  v6 = fcmp ne v0, v1     ; bin: 40 0f c2 d4 04
+[-, %xmm2]  v7 = fcmp uge v0, v1    ; bin: 40 0f c2 d4 05
+[-, %xmm2]  v8 = fcmp ugt v0, v1    ; bin: 40 0f c2 d4 06
+[-, %xmm2]  v9 = fcmp ord v0, v1    ; bin: 40 0f c2 d4 07
+            return
+}
+
+function %fcmp_f64x2(f64x2, f64x2) {
+ebb0(v0: f64x2 [%xmm2], v1: f64x2 [%xmm0]):
+[-, %xmm2]  v2 = fcmp eq v0, v1     ; bin: 66 40 0f c2 d0 00
+[-, %xmm2]  v3 = fcmp lt v0, v1     ; bin: 66 40 0f c2 d0 01
+[-, %xmm2]  v4 = fcmp le v0, v1     ; bin: 66 40 0f c2 d0 02
+[-, %xmm2]  v5 = fcmp uno v0, v1    ; bin: 66 40 0f c2 d0 03
+[-, %xmm2]  v6 = fcmp ne v0, v1     ; bin: 66 40 0f c2 d0 04
+[-, %xmm2]  v7 = fcmp uge v0, v1    ; bin: 66 40 0f c2 d0 05
+[-, %xmm2]  v8 = fcmp ugt v0, v1    ; bin: 66 40 0f c2 d0 06
+[-, %xmm2]  v9 = fcmp ord v0, v1    ; bin: 66 40 0f c2 d0 07
+            return
+}
--- a/cranelift/filetests/filetests/isa/x86/simd-comparison-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-comparison-run.clif
@@ -177,3 +177,43 @@ ebb0:
    return v8
 }
 ; run
+
+function %fcmp_eq_f32x4() -> b1 {
+ebb0:
+    v0 = vconst.f32x4 [0.0 -0x4.2 0x0.33333 -0.0]
+    v1 = vconst.f32x4 [0.0 -0x4.2 0x0.33333 -0.0]
+    v2 = fcmp eq v0, v1
+    v8 = vall_true v2
+    return v8
+}
+; run
+
+function %fcmp_lt_f32x4() -> b1 {
+ebb0:
+    v0 = vconst.f32x4 [0.0      -0x4.2  0x0.0       -0.0]
+    v1 = vconst.f32x4 [0x0.001  0x4.2   0x0.33333   0x1.0]
+    v2 = fcmp lt v0, v1
+    v8 = vall_true v2
+    return v8
+}
+; run
+
+function %fcmp_uge_f64x2() -> b1 {
+ebb0:
+    v0 = vconst.f64x2 [0x0.0  0x4.2]
+    v1 = vconst.f64x2 [0.0    0x4.1]
+    v2 = fcmp uge v0, v1
+    v8 = vall_true v2
+    return v8
+}
+; run
+
+function %fcmp_uno_f64x2() -> b1 {
+ebb0:
+    v0 = vconst.f64x2 [0.0  NaN]
+    v1 = vconst.f64x2 [NaN  0x4.1]
+    v2 = fcmp uno v0, v1
+    v8 = vall_true v2
+    return v8
+}
+; run