From 96d51cb1e8223137300bd2ef5f17074443e007d6 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Mon, 30 Sep 2019 11:27:29 -0700
Subject: [PATCH] Switch x86 SIMD bor from ORPS to POR encoding

There are two reasons for this change:
 1. it reduces confusion: using the `POR` encoding will match the future encodings of `band` and `bxor`, whereas the `ORPS` encoding may be confusing because it is intended for floating-point operations
 2. `POR` has slightly higher throughput: it only has to wait 0.33 cycles to execute again on all Intel architectures above Core, whereas `ORPS` must wait 1 cycle on architectures older than Skylake (Intel Optimization Reference Manual, C.3)

`POR` does add one additional byte to the encoding and requires SSE2, so the `ORPS` opcode is left in for future use.
---
 cranelift/codegen/meta/src/isa/x86/encodings.rs | 13 ++++++-------
 cranelift/codegen/meta/src/isa/x86/opcodes.rs   |  3 +++
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index c9b1ed8b42..a42c3872ef 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1874,13 +1874,6 @@ pub(crate) fn define<'defs>(
         e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
     }
 
-    // SIMD bor using ORPS
-    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
-        let instruction = bor.bind(vector(ty, sse_vector_size));
-        let template = rec_fa.nonrex().opcodes(&ORPS);
-        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
-    }
-
     // SIMD register movement: store, load, spill, fill, regmove. All of these use encodings of
     // MOVUPS and MOVAPS from SSE (TODO ideally all of these would either use MOVAPS when we have
     // alignment or type-specific encodings, see https://github.com/CraneStation/cranelift/issues/1039).
@@ -1980,6 +1973,12 @@ pub(crate) fn define<'defs>(
         e.enc_32_64_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap);
     }
 
+    // SIMD bor
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
+        let bor = bor.bind(vector(ty, sse_vector_size));
+        e.enc_32_64(bor, rec_fa.nonrex().opcodes(&POR));
+    }
+
     // SIMD icmp using PCMPEQ*
     for ty in ValueType::all_lane_types().filter(|t| t.is_int() && allowed_simd_type(t)) {
         let (opcodes, isa_predicate): (&[_], _) = match ty.lane_bits() {
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index f81d2423ea..d315de0113 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -307,6 +307,9 @@ pub static POP_REG: [u8; 1] = [0x58];
 /// Returns the count of number of bits set to 1.
 pub static POPCNT: [u8; 3] = [0xf3, 0x0f, 0xb8];
 
+/// Bitwise OR of xmm2/m128 and xmm1 (SSE2).
+pub static POR: [u8; 3] = [0x66, 0x0f, 0xeb];
+
 /// Shuffle bytes in xmm1 according to contents of xmm2/m128 (SSE3).
 pub static PSHUFB: [u8; 4] = [0x66, 0x0f, 0x38, 0x00];