Add encodings for i8 and i16 copy, spill, fill, ireduce.i8.i16 (#534)

* Add encodings for i8 and i16 copy, spill, fill, ireduce.i8.i16.
  Also adds legalization for srem, irsub_imm, {u,s}extend.i16.i8.
  Fixes #477, cc #466.
* Legalize popcnt, clz and ctz for i8 and i16.
* Fix bug in call_memset.
2018-10-03 23:43:59 +02:00
parent ddf8fd23b5
commit b2a28d69e6
8 changed files with 139 additions and 12 deletions
--- a/lib/codegen/meta-python/base/legalize.py
+++ b/lib/codegen/meta-python/base/legalize.py
@@ -248,12 +248,12 @@ def widen_imm(signed, op):
                ))


+# int ops
 for binop in [iadd, isub, imul, udiv, urem]:
    widen_two_arg(False, binop)

-widen_two_arg(True, sdiv)
-
-widen_one_arg(False, bnot)
+for binop in [sdiv, srem]:
+    widen_two_arg(True, binop)

 for binop in [iadd_imm, imul_imm, udiv_imm, urem_imm]:
    widen_imm(False, binop)
@@ -261,13 +261,50 @@ for binop in [iadd_imm, imul_imm, udiv_imm, urem_imm]:
 for binop in [sdiv_imm, srem_imm]:
    widen_imm(True, binop)

+widen_imm(False, irsub_imm)
+
 # bit ops
+widen_one_arg(False, bnot)
+
 for binop in [band, bor, bxor, band_not, bor_not, bxor_not]:
    widen_two_arg(False, binop)

 for binop in [band_imm, bor_imm, bxor_imm]:
    widen_imm(False, binop)

+widen_one_arg(False, insts.popcnt)
+
+for (int_ty, num) in [(types.i8, 24), (types.i16, 16)]:
+    widen.legalize(
+        a << insts.clz.bind(int_ty)(b),
+        Rtl(
+            c << uextend.i32(b),
+            d << insts.clz.i32(c),
+            e << iadd_imm(d, imm64(-num)),
+            a << ireduce.bind(int_ty)(e)
+        ))
+
+    widen.legalize(
+        a << insts.cls.bind(int_ty)(b),
+        Rtl(
+            c << sextend.i32(b),
+            d << insts.cls.i32(c),
+            e << iadd_imm(d, imm64(-num)),
+            a << ireduce.bind(int_ty)(e)
+        ))
+
+for (int_ty, num) in [(types.i8, 1 << 8), (types.i16, 1 << 16)]:
+    widen.legalize(
+        a << insts.ctz.bind(int_ty)(b),
+        Rtl(
+            c << uextend.i32(b),
+            # Sentinel bit: when `b` is zero, ctz yields the bit width of `b`.
+            d << bor_imm(c, imm64(num)),
+            e << insts.ctz.i32(d),
+            a << ireduce.bind(int_ty)(e)
+        ))
+
+# iconst
 for int_ty in [types.i8, types.i16]:
    widen.legalize(
        a << iconst.bind(int_ty)(b),
@@ -276,6 +313,21 @@ for int_ty in [types.i8, types.i16]:
            a << ireduce.bind(int_ty)(c)
        ))

+widen.legalize(
+    a << uextend.i16.i8(b),
+    Rtl(
+        c << uextend.i32(b),
+        a << ireduce(c)
+    ))
+
+widen.legalize(
+    a << sextend.i16.i8(b),
+    Rtl(
+        c << sextend.i32(b),
+        a << ireduce(c)
+    ))
+
+
 widen.legalize(
    store.i8(flags, a, ptr, offset),
    Rtl(
--- a/lib/codegen/meta-python/isa/x86/encodings.py
+++ b/lib/codegen/meta-python/isa/x86/encodings.py
@@ -173,7 +173,8 @@ enc_i32_i64(x86.smulx, r.mulx, 0xf7, rrr=5)
 enc_i32_i64(x86.umulx, r.mulx, 0xf7, rrr=4)

 enc_i32_i64(base.copy, r.umr, 0x89)
-enc_both(base.copy.b1, r.umr, 0x89)
+for ty in [types.b1, types.i8, types.i16]:
+    enc_both(base.copy.bind(ty), r.umr, 0x89)

 # For x86-64, only define REX forms for now, since we can't describe the
 # special regunit immediate operands with the current constraint language.
@@ -301,11 +302,12 @@ for recipe in [r.st_abcd, r.stDisp8_abcd, r.stDisp32_abcd]:
 enc_i32_i64(base.spill, r.spillSib32, 0x89)
 enc_i32_i64(base.regspill, r.regspill32, 0x89)

-# Use a 32-bit write for spilling `b1` to avoid constraining the permitted
-# registers.
+# Use a 32-bit write for spilling `b1`, `i8` and `i16` to avoid
+# constraining the permitted registers.
 # See MIN_SPILL_SLOT_SIZE which makes this safe.
-enc_both(base.spill.b1, r.spillSib32, 0x89)
-enc_both(base.regspill.b1, r.regspill32, 0x89)
+for ty in [types.b1, types.i8, types.i16]:
+    enc_both(base.spill.bind(ty), r.spillSib32, 0x89)
+    enc_both(base.regspill.bind(ty), r.regspill32, 0x89)

 for recipe in [r.ld, r.ldDisp8, r.ldDisp32]:
    enc_i32_i64_ld_st(base.load, True, recipe, 0x8b)
@@ -319,9 +321,10 @@ for recipe in [r.ld, r.ldDisp8, r.ldDisp32]:
 enc_i32_i64(base.fill, r.fillSib32, 0x8b)
 enc_i32_i64(base.regfill, r.regfill32, 0x8b)

-# Load 32 bits from `b1` spill slots. See `spill.b1` above.
-enc_both(base.fill.b1, r.fillSib32, 0x8b)
-enc_both(base.regfill.b1, r.regfill32, 0x8b)
+# Load 32 bits from `b1`, `i8` and `i16` spill slots. See `spill.b1` above.
+for ty in [types.b1, types.i8, types.i16]:
+    enc_both(base.fill.bind(ty), r.fillSib32, 0x8b)
+    enc_both(base.regfill.bind(ty), r.regfill32, 0x8b)

 # Push and Pop
 X86_32.enc(x86.push.i32, *r.pushq(0x50))
@@ -578,8 +581,11 @@ X86_64.enc(base.bint.i32.b1, *r.urm_noflags_abcd(0x0f, 0xb6))
 # Numerical conversions.

 # Reducing an integer is a no-op.
+X86_32.enc(base.ireduce.i8.i16, r.null, 0)
 X86_32.enc(base.ireduce.i8.i32, r.null, 0)
 X86_32.enc(base.ireduce.i16.i32, r.null, 0)
+
+X86_64.enc(base.ireduce.i8.i16, r.null, 0)
 X86_64.enc(base.ireduce.i8.i32, r.null, 0)
 X86_64.enc(base.ireduce.i16.i32, r.null, 0)
 X86_64.enc(base.ireduce.i8.i64, r.null, 0)