From 6bf96d696393bf3eccb6e591b8b60af7097ed971 Mon Sep 17 00:00:00 2001
From: Alexis Engelke <alexis.engelke@googlemail.com>
Date: Wed, 28 Dec 2022 11:28:29 +0100
Subject: [PATCH] parseinstrs: Improve performance of superstring

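Replace the greedy search, which scanned all remaining strings for the
maximum overlap in every step, with a sorted list and a binary search:
for each suffix of the merged string (longest first, at most 16
characters), look up a remaining string that starts with that suffix.

The new algorithm yields slightly worse results, but is substantially
faster for larger string counts.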
---
 parseinstrs.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/parseinstrs.py b/parseinstrs.py
index 35bab67..f520743 100644
--- a/parseinstrs.py
+++ b/parseinstrs.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 
 import argparse
+import bisect
 from collections import OrderedDict, defaultdict, namedtuple, Counter
 from enum import Enum
 from itertools import product
@@ -515,16 +516,17 @@ def superstring(strs):
     # Greedy heuristic generally yields acceptable results, though it depends on
     # the order of the menmonics. More compact results are possible, but the
     # expectable gains of an optimal result (probably with O(n!)) are small.
+    # First sort strings and later do a binary search for each possible prefix.
+    realstrs.sort()
     merged = ""
-    def maxoverlap(s1, s2):
-        for i in range(min(len(s1), len(s2))-1, 0, -1):
-            if s1[:i] == s2[-i:]:
-                return i
-        return 0
     while realstrs:
-        s = max(realstrs, key=lambda k: maxoverlap(k, merged))
-        merged += s[maxoverlap(s, merged):]
-        realstrs.remove(s)
+        for i in range(min(16, len(merged)), 0, -1):
+            idx = bisect.bisect_left(realstrs, merged[-i:])
+            if idx < len(realstrs) and realstrs[idx][:i] == merged[-i:]:
+                merged += realstrs.pop(idx)[i:]
+                break
+        else:
+            merged += realstrs.pop()
     return merged
 
 def decode_table(entries, args):
@@ -570,7 +572,7 @@ def decode_table(entries, args):
                         .replace("REP_", "REP ").replace("CMPXCHGD", "CMPXCHG")
                         .replace("JCXZ", "JCXZ JECXZJRCXZ")
                         .replace("C_SEP", "CWD CDQ CQO")
-                        .replace("C_EX", "CBW CWDECDQE")
+                        .replace("C_EX", "CBW CWDECDQE").replace("XCHG_NOP", "")
                         .lower() for m in mnems]
     mnemonics_str = superstring(mnemonics_intel)