From 8cfff2695713bce54c465a16005663aa143c7385 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 4 Aug 2020 11:12:01 -0700
Subject: [PATCH] machinst x64: implement floating point comparisons

Note that this fixes an encoding issue in which the packed single and packed double prefixes were flipped.
---
 build.rs                                      |   2 +
 cranelift/codegen/src/isa/x64/inst/emit.rs    |   4 +-
 .../codegen/src/isa/x64/inst/emit_tests.rs    |  13 ++
 cranelift/codegen/src/isa/x64/lower.rs        | 205 +++++++++++-------
 4 files changed, 142 insertions(+), 82 deletions(-)
diff --git a/build.rs b/build.rs
index 200f532b2a..42e152604a 100644
--- a/build.rs
+++ b/build.rs
@@ -182,7 +182,9 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
     match (testsuite, testname) {
         ("simd", "simd_address") => return false,
         ("simd", "simd_f32x4_arith") => return false,
+        ("simd", "simd_f32x4_cmp") => return false,
         ("simd", "simd_f64x2_arith") => return false,
+        ("simd", "simd_f64x2_cmp") => return false,
         ("simd", "simd_store") => return false,
         ("simd", _) => return true,
         _ => {}
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 26fd5a8947..764d34af6d 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1767,8 +1767,8 @@ pub(crate) fn emit(
 
         Inst::XmmRmRImm { op, src, dst, imm } => {
             let prefix = match op {
-                SseOpcode::Cmpps => LegacyPrefix::_66,
-                SseOpcode::Cmppd => LegacyPrefix::None,
+                SseOpcode::Cmpps => LegacyPrefix::None,
+                SseOpcode::Cmppd => LegacyPrefix::_66,
                 SseOpcode::Cmpss => LegacyPrefix::_F3,
                 SseOpcode::Cmpsd => LegacyPrefix::_F2,
                 _ => unimplemented!("Opcode {:?} not implemented", op),
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index b966398804..e0f26c1966 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3190,6 +3190,19 @@ fn test_x64_emit() {
         "psrlq   $1, %xmm3",
     ));
 
+    // ========================================================
+    // XmmRmRImm
+    insns.push((
+        Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2),
+        "660FC2CD02",
+        "cmppd   $2, %xmm5, %xmm1",
+    ));
+    insns.push((
+        Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0),
+        "410FC2FF00",
+        "cmpps   $0, %xmm15, %xmm7",
+    ));
+
     // ========================================================
     // Misc instructions.
 
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 2d4a22933e..b84680245c 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -855,92 +855,137 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Fcmp => {
             let condcode = inst_fp_condcode(ctx.data(insn)).unwrap();
             let input_ty = ctx.input_ty(insn, 0);
-            let op = match input_ty {
-                F32 => SseOpcode::Ucomiss,
-                F64 => SseOpcode::Ucomisd,
-                _ => panic!("Bad input type to Fcmp"),
-            };
+            if !input_ty.is_vector() {
+                let op = match input_ty {
+                    F32 => SseOpcode::Ucomiss,
+                    F64 => SseOpcode::Ucomisd,
+                    _ => panic!("Bad input type to fcmp: {}", input_ty),
+                };
 
-            // Unordered is returned by setting ZF, PF, CF <- 111
-            // Greater than by ZF, PF, CF <- 000
-            // Less than by ZF, PF, CF <- 001
-            // Equal by ZF, PF, CF <- 100
-            //
-            // Checking the result of comiss is somewhat annoying because you don't have setcc
-            // instructions that explicitly check simultaneously for the condition (i.e. eq, le,
-            // gt, etc) *and* orderedness.
-            //
-            // So that might mean we need more than one setcc check and then a logical "and" or
-            // "or" to determine both, in some cases.  However knowing that if the parity bit is
-            // set, then the result was considered unordered and knowing that if the parity bit is
-            // set, then both the ZF and CF flag bits must also be set we can get away with using
-            // one setcc for most condition codes.
+                // Unordered is returned by setting ZF, PF, CF <- 111
+                // Greater than by ZF, PF, CF <- 000
+                // Less than by ZF, PF, CF <- 001
+                // Equal by ZF, PF, CF <- 100
+                //
+                // Checking the result of comiss is somewhat annoying because you don't have setcc
+                // instructions that explicitly check simultaneously for the condition (i.e. eq, le,
+                // gt, etc) *and* orderedness.
+                //
+                // So that might mean we need more than one setcc check and then a logical "and" or
+                // "or" to determine both, in some cases.  However knowing that if the parity bit is
+                // set, then the result was considered unordered and knowing that if the parity bit is
+                // set, then both the ZF and CF flag bits must also be set we can get away with using
+                // one setcc for most condition codes.
 
-            match condcode {
-                FloatCC::LessThan
-                | FloatCC::LessThanOrEqual
-                | FloatCC::UnorderedOrGreaterThan
-                | FloatCC::UnorderedOrGreaterThanOrEqual => {
-                    // setb and setbe for ordered LessThan and LessThanOrEqual check if CF = 1
-                    // which doesn't exclude unorderdness. To get around this we can reverse the
-                    // operands and the cc test to instead check if CF and ZF are 0 which would
-                    // also excludes unorderedness. Using similiar logic we also reverse
-                    // UnorderedOrGreaterThan and UnorderedOrGreaterThanOrEqual and assure that ZF
-                    // or CF is 1 to exclude orderedness.
-                    let lhs = input_to_reg_mem(ctx, inputs[0]);
-                    let rhs = input_to_reg(ctx, inputs[1]);
-                    let dst = output_to_reg(ctx, outputs[0]);
-                    ctx.emit(Inst::xmm_cmp_rm_r(op, lhs, rhs));
-                    let condcode = condcode.reverse();
-                    let cc = CC::from_floatcc(condcode);
-                    ctx.emit(Inst::setcc(cc, dst));
+                match condcode {
+                    FloatCC::LessThan
+                    | FloatCC::LessThanOrEqual
+                    | FloatCC::UnorderedOrGreaterThan
+                    | FloatCC::UnorderedOrGreaterThanOrEqual => {
+                        // setb and setbe for ordered LessThan and LessThanOrEqual check if CF = 1
+                        // which doesn't exclude unorderdness. To get around this we can reverse the
+                        // operands and the cc test to instead check if CF and ZF are 0 which would
+                        // also excludes unorderedness. Using similiar logic we also reverse
+                        // UnorderedOrGreaterThan and UnorderedOrGreaterThanOrEqual and assure that ZF
+                        // or CF is 1 to exclude orderedness.
+                        let lhs = input_to_reg_mem(ctx, inputs[0]);
+                        let rhs = input_to_reg(ctx, inputs[1]);
+                        let dst = output_to_reg(ctx, outputs[0]);
+                        ctx.emit(Inst::xmm_cmp_rm_r(op, lhs, rhs));
+                        let condcode = condcode.reverse();
+                        let cc = CC::from_floatcc(condcode);
+                        ctx.emit(Inst::setcc(cc, dst));
+                    }
+
+                    FloatCC::Equal => {
+                        // Outlier case: equal means both the operands are ordered and equal; we cannot
+                        // get around checking the parity bit to determine if the result was ordered.
+                        let lhs = input_to_reg(ctx, inputs[0]);
+                        let rhs = input_to_reg_mem(ctx, inputs[1]);
+                        let dst = output_to_reg(ctx, outputs[0]);
+                        let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, I32);
+                        ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
+                        ctx.emit(Inst::setcc(CC::NP, tmp_gpr1));
+                        ctx.emit(Inst::setcc(CC::Z, dst));
+                        ctx.emit(Inst::alu_rmi_r(
+                            false,
+                            AluRmiROpcode::And,
+                            RegMemImm::reg(tmp_gpr1.to_reg()),
+                            dst,
+                        ));
+                    }
+
+                    FloatCC::NotEqual => {
+                        // Outlier case: not equal means either the operands are unordered, or they're
+                        // not the same value.
+                        let lhs = input_to_reg(ctx, inputs[0]);
+                        let rhs = input_to_reg_mem(ctx, inputs[1]);
+                        let dst = output_to_reg(ctx, outputs[0]);
+                        let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, I32);
+                        ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
+                        ctx.emit(Inst::setcc(CC::P, tmp_gpr1));
+                        ctx.emit(Inst::setcc(CC::NZ, dst));
+                        ctx.emit(Inst::alu_rmi_r(
+                            false,
+                            AluRmiROpcode::Or,
+                            RegMemImm::reg(tmp_gpr1.to_reg()),
+                            dst,
+                        ));
+                    }
+
+                    _ => {
+                        // For all remaining condition codes we can handle things with one check.
+                        let lhs = input_to_reg(ctx, inputs[0]);
+                        let rhs = input_to_reg_mem(ctx, inputs[1]);
+                        let dst = output_to_reg(ctx, outputs[0]);
+                        let cc = CC::from_floatcc(condcode);
+                        ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
+                        ctx.emit(Inst::setcc(cc, dst));
+                    }
                 }
+            } else {
+                let op = match input_ty {
+                    types::F32X4 => SseOpcode::Cmpps,
+                    types::F64X2 => SseOpcode::Cmppd,
+                    _ => panic!("Bad input type to fcmp: {}", input_ty),
+                };
 
-                FloatCC::Equal => {
-                    // Outlier case: equal means both the operands are ordered and equal; we cannot
-                    // get around checking the parity bit to determine if the result was ordered.
-                    let lhs = input_to_reg(ctx, inputs[0]);
-                    let rhs = input_to_reg_mem(ctx, inputs[1]);
-                    let dst = output_to_reg(ctx, outputs[0]);
-                    let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, I32);
-                    ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
-                    ctx.emit(Inst::setcc(CC::NP, tmp_gpr1));
-                    ctx.emit(Inst::setcc(CC::Z, dst));
-                    ctx.emit(Inst::alu_rmi_r(
-                        false,
-                        AluRmiROpcode::And,
-                        RegMemImm::reg(tmp_gpr1.to_reg()),
-                        dst,
-                    ));
-                }
+                // Since some packed comparisons are not available, some of the condition codes
+                // must be inverted, with a corresponding `flip` of the operands.
+                let (imm, flip) = match condcode {
+                    FloatCC::GreaterThan => (FcmpImm::LessThan, true),
+                    FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
+                    FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
+                    FloatCC::UnorderedOrLessThanOrEqual => {
+                        (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
+                    }
+                    FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
+                        panic!("unsupported float condition code: {}", condcode)
+                    }
+                    _ => (FcmpImm::from(condcode), false),
+                };
 
-                FloatCC::NotEqual => {
-                    // Outlier case: not equal means either the operands are unordered, or they're
-                    // not the same value.
-                    let lhs = input_to_reg(ctx, inputs[0]);
-                    let rhs = input_to_reg_mem(ctx, inputs[1]);
-                    let dst = output_to_reg(ctx, outputs[0]);
-                    let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, I32);
-                    ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
-                    ctx.emit(Inst::setcc(CC::P, tmp_gpr1));
-                    ctx.emit(Inst::setcc(CC::NZ, dst));
-                    ctx.emit(Inst::alu_rmi_r(
-                        false,
-                        AluRmiROpcode::Or,
-                        RegMemImm::reg(tmp_gpr1.to_reg()),
-                        dst,
-                    ));
-                }
+                // Determine the operands of the comparison, possibly by flipping them.
+                let (lhs, rhs) = if flip {
+                    (
+                        input_to_reg(ctx, inputs[1]),
+                        input_to_reg_mem(ctx, inputs[0]),
+                    )
+                } else {
+                    (
+                        input_to_reg(ctx, inputs[0]),
+                        input_to_reg_mem(ctx, inputs[1]),
+                    )
+                };
 
-                _ => {
-                    // For all remaining condition codes we can handle things with one check.
-                    let lhs = input_to_reg(ctx, inputs[0]);
-                    let rhs = input_to_reg_mem(ctx, inputs[1]);
-                    let dst = output_to_reg(ctx, outputs[0]);
-                    let cc = CC::from_floatcc(condcode);
-                    ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
-                    ctx.emit(Inst::setcc(cc, dst));
-                }
+                // Move the `lhs` to the same register as `dst`; this may not emit an actual move
+                // but ensures that the registers are the same to match x86's read-write operand
+                // encoding.
+                let dst = output_to_reg(ctx, outputs[0]);
+                ctx.emit(Inst::gen_move(dst, lhs, input_ty));
+
+                // Emit the comparison.
+                ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode()));
             }
         }