x64 backend: implement 128-bit ops and misc fixes.

This implements all of the I128 ops that are implemented by the legacy
x86 backend, and includes all ops required by at least one major
use-case (the cg_clif rustc backend).

The sequences are open-coded where necessary; for the bit operations in
particular they can be somewhat complex, but they have been tested
carefully. This PR also includes a drive-by fix of clz/ctz for the 8-
and 16-bit cases, which were previously incorrect.
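For reference, the kind of adjustment the narrow cases need, as a
minimal sketch in plain Rust (this models the pitfall, not the
backend's actual lowering):

    fn clz8(x: u8) -> u32 {
        // Counting through a 32-bit register sees 24 extra leading
        // zeros; without the subtraction, clz.i8 is off by 24.
        (x as u32).leading_zeros() - 24
    }

    fn ctz8(x: u8) -> u32 {
        // Planting a bit just above the width caps the zero-input
        // result at 8 rather than 32.
        ((x as u32) | 0x100).trailing_zeros()
    }

    fn main() {
        assert_eq!(clz8(0x10), 3);
        assert_eq!(clz8(0), 8);
        assert_eq!(ctz8(0), 8);
    }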

It also includes ride-along fixes developed while bringing up cg_clif
support; these are difficult to separate out cleanly because of other
refactors that occurred in this PR:

- fix REX prefix logic for some 8-bit instructions.

  When using an 8-bit register in 64-bit mode on x86-64, the REX prefix
  semantics are somewhat subtle: without a REX prefix, register numbers
  4-7 refer to the second-to-lowest byte of the first four registers
  (AH, CH, DH, BH), whereas with a REX prefix, the same numbers follow
  the usual encoding (SPL, BPL, SIL, DIL). We could always emit a REX
  byte for instructions with 8-bit forms (this is harmless even if
  unneeded), but that would unnecessarily inflate code size; instead,
  the usual approach is to emit it only for these registers.

  This logic was present for some instructions but missing for others:
  divide, not, negate, and the shifts.

  Fixes #2508.
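
  To illustrate the rule (hypothetical helper, not the actual emitter
  code):

    fn rex_needed_for_8bit_reg(enc: u8) -> bool {
        // 4-7 must carry a REX prefix to mean SPL/BPL/SIL/DIL rather
        // than AH/CH/DH/BH; 8-15 need a REX bit for their high bit
        // regardless.
        enc >= 4
    }

    fn main() {
        assert!(!rex_needed_for_8bit_reg(0)); // AL encodes fine without REX
        assert!(rex_needed_for_8bit_reg(6));  // without REX, 6 means DH, not SIL
        assert!(rex_needed_for_8bit_reg(12)); // R12B always needs a REX bit
    }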

- avoid unaligned SSE loads on some f64 ops.

  The implementations of several FP ops, such as fabs/fneg, use SSE
  instructions. That is not a problem per se, except that load-op
  merging did not take *alignment* into account: if an op on an f64
  loaded from memory happened to merge that load, and the instruction
  into which it was merged was an SSE instruction, then the SSE
  instruction imposed a stricter (128-bit) alignment requirement than
  the load.f64 did.

  This PR simply forces any lowering that may use SSE instructions to
  implement a non-SIMD operation to take its inputs in registers only,
  avoiding load-op merging.

  Fixes #2507.
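
  The hazard in miniature, in plain Rust (this just models the
  alignment rule, not the lowering):

    fn safe_to_merge_into_sse(addr: usize) -> bool {
        // load.f64 guarantees only 8-byte alignment, but the memory
        // form of a non-AVX SSE instruction such as ANDPD requires
        // 16-byte alignment.
        addr % 16 == 0
    }

    fn main() {
        assert!(safe_to_merge_into_sse(0x1000));
        assert!(!safe_to_merge_into_sse(0x1008)); // fine for f64, faults as m128
    }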

- two bugfixes exposed by cg_clif: urem/srem.i8, select.b1.

  - urem/srem.i8: the 8-bit form of the DIV instruction on x86-64 places
    the remainder in AH, not RDX, unlike all the other width forms of
    this instruction (modeled in the sketch after this list).

  - select.b1: we were not recognizing selects of boolean values as
    integer-typed operations, so we were generating XMM moves instead (!).
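
  A plain-Rust model of the 8-bit DIV behavior described above
  (hypothetical helper, mirroring the hardware rather than the backend
  code):

    fn div8(ax: u16, divisor: u8) -> (u8, u8) {
        // AL gets the quotient; AH, not DX, gets the remainder.
        ((ax / divisor as u16) as u8, (ax % divisor as u16) as u8)
    }

    fn main() {
        assert_eq!(div8(47, 5), (9, 2)); // 47 = 9*5 + 2
    }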
Chris Fallin
2020-12-12 22:21:39 -08:00
parent 705af0ac41
commit 71ead6e31d
12 changed files with 3213 additions and 675 deletions


@@ -0,0 +1,27 @@
test run
target x86_64
feature "experimental_x64"
function %ctz(i64, i64) -> i8 {
block0(v0: i64, v1: i64):
v2 = iconcat v0, v1
v3 = ctz.i128 v2
v4 = ireduce.i8 v3
return v4
}
; run: %ctz(0x00000000_00000000, 0x00000001_00000000) == 96
; run: %ctz(0x00000000_00010000, 0x00000001_00000000) == 16
; run: %ctz(0x00000000_00010000, 0x00000000_00000000) == 16
; run: %ctz(0x00000000_00000000, 0x00000000_00000000) == 128
function %clz(i64, i64) -> i8 {
block0(v0: i64, v1: i64):
v2 = iconcat v0, v1
v3 = clz.i128 v2
v4 = ireduce.i8 v3
return v4
}
; run: %clz(0x00000000_00000000, 0x00000001_00000000) == 31
; run: %clz(0x00000000_00010000, 0x00000001_00000000) == 31
; run: %clz(0x00000000_00010000, 0x00000000_00000000) == 111
; run: %clz(0x00000000_00000000, 0x00000000_00000000) == 128
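The reference semantics these tests exercise, as a Rust sketch
(iconcat's first operand is the low half; an all-zero input counts the
full width):

    fn ctz128(lo: u64, hi: u64) -> u32 {
        if lo != 0 {
            lo.trailing_zeros()
        } else if hi != 0 {
            64 + hi.trailing_zeros()
        } else {
            128
        }
    }

    fn clz128(lo: u64, hi: u64) -> u32 {
        if hi != 0 {
            hi.leading_zeros()
        } else if lo != 0 {
            64 + lo.leading_zeros()
        } else {
            128
        }
    }

    fn main() {
        assert_eq!(ctz128(0, 0x00000001_00000000), 96);
        assert_eq!(clz128(0x00000000_00010000, 0), 111);
    }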


@@ -0,0 +1,47 @@
test run
target x86_64
feature "experimental_x64"
function %reverse_bits_zero() -> b1 {
block0:
v0 = iconst.i64 0
v1 = iconcat v0, v0
v2 = bitrev.i128 v1
v3 = icmp eq v2, v1
return v3
}
; run
function %reverse_bits_one() -> b1 {
block0:
v0 = iconst.i64 0
v1 = iconst.i64 1
v2 = iconcat v0, v1
v3 = bitrev.i128 v2
v4 = iconst.i64 0x8000_0000_0000_0000
v5 = iconst.i64 0
v6 = iconcat v4, v5
v7 = icmp eq v3, v6
return v7
}
; run
function %reverse_bits() -> b1 {
block0:
v0 = iconst.i64 0x06AD_8667_69EC_41BA
v1 = iconst.i64 0x6C83_D81A_6E28_83AB
v2 = iconcat v0, v1
v3 = bitrev.i128 v2
v4 = iconst.i64 0xD5C1_1476_581B_C136
v5 = iconst.i64 0x5D82_3796_E661_B560
v6 = iconcat v4, v5
v7 = icmp eq v3, v6
return v7
}
; run
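Reference semantics for these tests, as a Rust sketch: reversing 128
bits swaps the two halves and bit-reverses each.

    fn bitrev128(lo: u64, hi: u64) -> (u64, u64) {
        // Returns (new_lo, new_hi): the old high half, bit-reversed,
        // becomes the new low half, and vice versa.
        (hi.reverse_bits(), lo.reverse_bits())
    }

    fn main() {
        assert_eq!(bitrev128(0, 1), (0x8000_0000_0000_0000, 0));
        assert_eq!(
            bitrev128(0x06AD_8667_69EC_41BA, 0x6C83_D81A_6E28_83AB),
            (0xD5C1_1476_581B_C136, 0x5D82_3796_E661_B560)
        );
    }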


@@ -0,0 +1,26 @@
test compile
target x86_64
feature "experimental_x64"
function %f(f64) -> f64 {
block0(v0: f64):
v1 = fabs.f64 v0
return v1
}
; check: movabsq $$9223372036854775807, %rsi
; nextln: movq %rsi, %xmm1
; nextln: andpd %xmm0, %xmm1
; nextln: movaps %xmm1, %xmm0
function %f(i64) -> f64 {
block0(v0: i64):
v1 = load.f64 v0
v2 = fabs.f64 v1
return v2
}
; check: movsd 0(%rdi), %xmm0
; nextln: movabsq $$9223372036854775807, %rsi
; nextln: movq %rsi, %xmm1
; nextln: andpd %xmm0, %xmm1
; nextln: movaps %xmm1, %xmm0
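The trick being checked above: fabs clears the sign bit with a bitwise
AND, materializing the mask 0x7FFF_FFFF_FFFF_FFFF (9223372036854775807)
in a GPR and moving it into an XMM register. In plain Rust terms:

    fn fabs_via_mask(x: f64) -> f64 {
        // Clearing bit 63 is exactly what the andpd performs.
        f64::from_bits(x.to_bits() & 0x7FFF_FFFF_FFFF_FFFF)
    }

    fn main() {
        assert_eq!(fabs_via_mask(-2.5), 2.5);
        assert_eq!(fabs_via_mask(2.5), 2.5);
    }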

File diff suppressed because it is too large.


@@ -0,0 +1,29 @@
test compile
target x86_64
feature "experimental_x64"
function %f0(i32, i128, i128) -> i128 {
; check: pushq %rbp
; nextln: movq %rsp, %rbp
block0(v0: i32, v1: i128, v2: i128):
v3 = iconst.i32 42
v4 = icmp.i32 eq v0, v3
; nextln: movl $$42, %eax
; nextln: cmpl %eax, %edi
v5 = select.i128 v4, v1, v2
; nextln: cmovzq %rsi, %rcx
; nextln: cmovzq %rdx, %r8
return v5
; nextln: movq %rcx, %rax
; nextln: movq %r8, %rdx
; nextln: movq %rbp, %rsp
; nextln: popq %rbp
; nextln: ret
}
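What this test checks, in reference form: a 128-bit select is two
conditional moves guarded by the same flags, one per 64-bit half.

    fn select128(cond: bool, a: (u64, u64), b: (u64, u64)) -> (u64, u64) {
        // Each half is chosen independently, mirroring the paired cmovzq.
        (
            if cond { a.0 } else { b.0 },
            if cond { a.1 } else { b.1 },
        )
    }

    fn main() {
        assert_eq!(select128(true, (1, 2), (3, 4)), (1, 2));
        assert_eq!(select128(false, (1, 2), (3, 4)), (3, 4));
    }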


@@ -0,0 +1,106 @@
test run
target x86_64
feature "experimental_x64"
function %ishl1() -> b1 {
block0:
v0 = iconst.i64 0x01010101_01010101
v1 = iconcat v0, v0
v2 = iconst.i32 2
v3 = ishl.i128 v1, v2
v4 = iconst.i64 0x04040404_04040404
v5 = iconcat v4, v4
v6 = icmp eq v3, v5
return v6
}
; run
function %ishl2() -> b1 {
block0:
v0 = iconst.i64 0x01010101_01010101
v1 = iconst.i64 0x01010101_01010101
v2 = iconcat v0, v1
v3 = iconst.i32 9
v4 = ishl.i128 v2, v3
v5 = iconst.i64 0x02020202_02020200
v6 = iconst.i64 0x02020202_02020202
v7 = iconcat v5, v6
v8 = icmp eq v4, v7
return v8
}
; run
function %ishl3() -> b1 {
block0:
v0 = iconst.i64 0x01010101_01010101
v1 = iconst.i64 0xffffffff_ffffffff
v2 = iconcat v0, v1
v3 = iconst.i32 66
v4 = ishl.i128 v2, v3
v5 = iconst.i64 0x00000000_00000000
v6 = iconst.i64 0x04040404_04040404
v7 = iconcat v5, v6
v8 = icmp eq v4, v7
return v8
}
; run
function %ushr1() -> b1 {
block0:
v0 = iconst.i64 0x01010101_01010101
v1 = iconst.i64 0x01010101_01010101
v2 = iconcat v0, v1
v3 = iconst.i32 2
v4 = ushr.i128 v2, v3
v5 = iconst.i64 0x40404040_40404040
v6 = iconst.i64 0x00404040_40404040
v7 = iconcat v5, v6
v8 = icmp eq v4, v7
return v8
}
; run
function %ushr2() -> b1 {
block0:
v0 = iconst.i64 0x01010101_01010101
v1 = iconst.i64 0x01010101_01010101
v2 = iconcat v0, v1
v3 = iconst.i32 66
v4 = ushr.i128 v2, v3
v5 = iconst.i64 0x00404040_40404040
v6 = iconst.i64 0x00000000_00000000
v7 = iconcat v5, v6
v8 = icmp eq v4, v7
return v8
}
; run
function %sshr1() -> b1 {
block0:
v0 = iconst.i64 0x01010101_01010101
v1 = iconst.i64 0x81010101_01010101
v2 = iconcat v0, v1
v3 = iconst.i32 2
v4 = sshr.i128 v2, v3
v5 = iconst.i64 0x40404040_40404040
v6 = iconst.i64 0xe0404040_40404040
v7 = iconcat v5, v6
v8 = icmp eq v4, v7
return v8
}
; run
function %sshr2() -> b1 {
block0:
v0 = iconst.i64 0x12345678_9abcdef0
v1 = iconst.i64 0x80101010_10101010
v2 = iconcat v0, v1
v3 = iconst.i32 66
v4 = sshr.i128 v2, v3
v5 = iconst.i64 0xe0040404_04040404
v6 = iconst.i64 0xffffffff_ffffffff
v7 = iconcat v5, v6
v8 = icmp eq v4, v7
return v8
}
; run
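Reference semantics for the shift tests, as a Rust sketch (amounts are
masked to the type width, per CLIF shift semantics; sshr is analogous
to ushr with arithmetic shifts and sign fill):

    fn ishl128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
        match amt & 127 {
            0 => (lo, hi),
            n @ 1..=63 => (lo << n, (hi << n) | (lo >> (64 - n))),
            n => (0, lo << (n - 64)), // low half shifts wholly into the high half
        }
    }

    fn ushr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
        match amt & 127 {
            0 => (lo, hi),
            n @ 1..=63 => ((lo >> n) | (hi << (64 - n)), hi >> n),
            n => (hi >> (n - 64), 0),
        }
    }

    fn main() {
        // Mirrors %ishl3 and %ushr2 above.
        assert_eq!(
            ishl128(0x01010101_01010101, 0xffffffff_ffffffff, 66),
            (0, 0x04040404_04040404)
        );
        assert_eq!(
            ushr128(0x01010101_01010101, 0x01010101_01010101, 66),
            (0x00404040_40404040, 0)
        );
    }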