diff --git a/cranelift/filetests/isa/intel/binary64.cton b/cranelift/filetests/isa/intel/binary64.cton index 0b546961d8..b579b4d58d 100644 --- a/cranelift/filetests/isa/intel/binary64.cton +++ b/cranelift/filetests/isa/intel/binary64.cton @@ -336,6 +336,28 @@ ebb0: ; asm: divq %r10 [-,%rax,%rdx] v202, v203 = x86_udivmodx v190, v191, v3 ; bin: 49 f7 f2 + ; double-length multiply instructions, 64 bit + [-,%rax] v1001 = iconst.i64 1 + [-,%r15] v1002 = iconst.i64 2 + ; asm: mulq %r15 + [-,%rax,%rdx] v1003, v1004 = x86_umulx v1001, v1002 ; bin: 49 f7 e7 + ; asm: imulq %r15 + [-,%rax,%rdx] v1005, v1006 = x86_smulx v1001, v1002 ; bin: 49 f7 ef + + ; double-length multiply instructions, 32 bit + [-,%rax] v1011 = iconst.i32 1 + [-,%r15] v1012 = iconst.i32 2 + [-,%rcx] v1017 = iconst.i32 3 + ; asm: mull %r15d + [-,%rax,%rdx] v1013, v1014 = x86_umulx v1011, v1012 ; bin: 41 f7 e7 + ; asm: imull %r15d + [-,%rax,%rdx] v1015, v1016 = x86_smulx v1011, v1012 ; bin: 41 f7 ef + + ; asm: mull %ecx + [-,%rax,%rdx] v1018, v1019 = x86_umulx v1011, v1017 ; bin: f7 e1 + ; asm: imull %ecx + [-,%rax,%rdx] v1020, v1021 = x86_smulx v1011, v1017 ; bin: f7 e9 + ; Bit-counting instructions. ; asm: popcntq %rsi, %rcx diff --git a/cranelift/filetests/isa/intel/legalize-mulhi.cton b/cranelift/filetests/isa/intel/legalize-mulhi.cton new file mode 100644 index 0000000000..673a19db3b --- /dev/null +++ b/cranelift/filetests/isa/intel/legalize-mulhi.cton @@ -0,0 +1,45 @@ + +test compile +set is_64bit +isa intel baseline + +; umulhi/smulhi on 64 bit operands + +function %i64_umulhi(i64, i64) -> i64 { +ebb0(v10: i64, v11: i64): + v12 = umulhi v10, v11 + ; check: %rdi -> %rax + ; check: x86_umulx + ; check: %rdx -> %rax + return v12 +} + +function %i64_smulhi(i64, i64) -> i64 { +ebb0(v20: i64, v21: i64): + v22 = smulhi v20, v21 + ; check: %rdi -> %rax + ; check: x86_smulx + ; check: %rdx -> %rax + return v22 +} + + +; umulhi/smulhi on 32 bit operands + +function %i32_umulhi(i32, i32) -> i32 { +ebb0(v30: i32, v31: i32): + v32 = umulhi v30, v31 + ; check: %rdi -> %rax + ; check: x86_umulx + ; check: %rdx -> %rax + return v32 +} + +function %i32_smulhi(i32, i32) -> i32 { +ebb0(v40: i32, v41: i32): + v42 = smulhi v40, v41 + ; check: %rdi -> %rax + ; check: x86_smulx + ; check: %rdx -> %rax + return v42 +} diff --git a/cranelift/filetests/preopt/div_by_const_indirect.cton b/cranelift/filetests/preopt/div_by_const_indirect.cton new file mode 100644 index 0000000000..ccc83cd49b --- /dev/null +++ b/cranelift/filetests/preopt/div_by_const_indirect.cton @@ -0,0 +1,60 @@ + +test preopt +isa intel baseline + +; Cases where the denominator is created by an iconst + +function %indir_udiv32(i32) -> i32 { +ebb0(v0: i32): + v1 = iconst.i32 7 + v2 = udiv v0, v1 + ; check: iconst.i32 7 + ; check: iconst.i32 0x2492_4925 + ; check: umulhi v0, v3 + ; check: isub v0, v4 + ; check: ushr_imm v5, 1 + ; check: iadd v6, v4 + ; check: ushr_imm v7, 2 + ; check: copy v8 + return v2 +} + +function %indir_sdiv32(i32) -> i32 { +ebb0(v0: i32): + v1 = iconst.i32 -17 + v2 = sdiv v0, v1 + ; check: iconst.i32 -17 + ; check: iconst.i32 0xffff_ffff_8787_8787 + ; check: smulhi v0, v3 + ; check: sshr_imm v4, 3 + ; check: ushr_imm v5, 31 + ; check: iadd v5, v6 + ; check: copy v7 + return v2 +} + +function %indir_udiv64(i64) -> i64 { +ebb0(v0: i64): + v1 = iconst.i64 1337 + v2 = udiv v0, v1 + ; check: iconst.i64 1337 + ; check: iconst.i64 0xc411_9d95_2866_a139 + ; check: umulhi v0, v3 + ; check: ushr_imm v4, 10 + ; check: copy v5 + return v2 +} + +function %indir_sdiv64(i64) 
-> i64 { +ebb0(v0: i64): + v1 = iconst.i64 -90210 + v2 = sdiv v0, v1 + ; check: iconst.i64 0xffff_ffff_fffe_9f9e + ; check: iconst.i64 0xd181_4ee8_939c_b8bb + ; check: smulhi v0, v3 + ; check: sshr_imm v4, 14 + ; check: ushr_imm v5, 63 + ; check: iadd v5, v6 + ; check: copy v7 + return v2 +} diff --git a/cranelift/filetests/preopt/div_by_const_non_power_of_2.cton b/cranelift/filetests/preopt/div_by_const_non_power_of_2.cton new file mode 100644 index 0000000000..18811fcd82 --- /dev/null +++ b/cranelift/filetests/preopt/div_by_const_non_power_of_2.cton @@ -0,0 +1,267 @@ + +test preopt +isa intel baseline + +; -------- U32 -------- + +; complex case (mul, sub, shift, add, shift) +function %t_udiv32_p7(i32) -> i32 { +ebb0(v0: i32): + v1 = udiv_imm v0, 7 + ; check: iconst.i32 0x2492_4925 + ; check: umulhi v0, v2 + ; check: isub v0, v3 + ; check: ushr_imm v4, 1 + ; check: iadd v5, v3 + ; check: ushr_imm v6, 2 + ; check: copy v7 + return v1 +} + +; simple case (mul, shift) +function %t_udiv32_p125(i32) -> i32 { +ebb0(v0: i32): + v1 = udiv_imm v0, 125 + ; check: iconst.i32 0x1062_4dd3 + ; check: umulhi v0, v2 + ; check: ushr_imm v3, 3 + ; check: copy v4 + return v1 +} + +; simple case w/ shift by zero (mul) +function %t_udiv32_p641(i32) -> i32 { +ebb0(v0: i32): + v1 = udiv_imm v0, 641 + ; check: iconst.i32 0x0066_3d81 + ; check: umulhi v0, v2 + ; check: copy v3 + return v1 +} + + +; -------- S32 -------- + +; simple case w/ shift by zero (mul, add-sign-bit) +function %t_sdiv32_n6(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -6 + ; check: iconst.i32 0xffff_ffff_d555_5555 + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 31 + ; check: iadd v3, v4 + ; check: copy v5 + return v1 +} + +; simple case (mul, shift, add-sign-bit) +function %t_sdiv32_n5(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -5 + ; check: iconst.i32 0xffff_ffff_9999_9999 + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 1 + ; check: ushr_imm v4, 31 + ; check: iadd v4, v5 + ; check: copy v6 + return v1 +} + +; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit) +function %t_sdiv32_n3(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -3 + ; check: iconst.i32 0x5555_5555 + ; check: smulhi v0, v2 + ; check: isub v3, v0 + ; check: sshr_imm v4, 1 + ; check: ushr_imm v5, 31 + ; check: iadd v5, v6 + ; check: copy v7 + return v1 +} + +; simple case w/ shift by zero (mul, add-sign-bit) +function %t_sdiv32_p6(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 6 + ; check: iconst.i32 0x2aaa_aaab + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 31 + ; check: iadd v3, v4 + ; check: copy v5 + return v1 +} + +; case d > 0 && M < 0 (mull, add, shift, add-sign-bit) +function %t_sdiv32_p7(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 7 + ; check: iconst.i32 0xffff_ffff_9249_2493 + ; check: smulhi v0, v2 + ; check: iadd v3, v0 + ; check: sshr_imm v4, 2 + ; check: ushr_imm v5, 31 + ; check: iadd v5, v6 + ; check: copy v7 + return v1 +} + +; simple case (mul, shift, add-sign-bit) +function %t_sdiv32_p625(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 625 + ; check: iconst.i32 0x68db_8bad + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 8 + ; check: ushr_imm v4, 31 + ; check: iadd v4, v5 + ; check: copy v6 + return v1 +} + + +; -------- U64 -------- + +; complex case (mul, sub, shift, add, shift) +function %t_udiv64_p7(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 7 + ; check: iconst.i64 0x2492_4924_9249_2493 + ; check: umulhi v0, v2 + ; check: isub v0, v3 + ; check: ushr_imm v4, 1 + ; check: iadd v5, v3 + ; check: ushr_imm v6, 2 + ; 
check: copy v7 + return v1 +} + +; simple case (mul, shift) +function %t_udiv64_p9(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 9 + ; check: iconst.i64 0xe38e_38e3_8e38_e38f + ; check: umulhi v0, v2 + ; check: ushr_imm v3, 3 + ; check: copy v4 + return v1 +} + +; complex case (mul, sub, shift, add, shift) +function %t_udiv64_p125(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 125 + ; check: iconst.i64 0x0624_dd2f_1a9f_be77 + ; check: umulhi v0, v2 + ; check: isub v0, v3 + ; check: ushr_imm v4, 1 + ; check: iadd v5, v3 + ; check: ushr_imm v6, 6 + ; check: copy v7 + return v1 +} + +; simple case w/ shift by zero (mul) +function %t_udiv64_p274177(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 274177 + ; check: iconst.i64 0x3d30_f19c_d101 + ; check: umulhi v0, v2 + ; check: copy v3 + return v1 +} + + +; -------- S64 -------- + +; simple case (mul, shift, add-sign-bit) +function %t_sdiv64_n625(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -625 + ; check: iconst.i64 0xcb92_3a29_c779_a6b5 + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 7 + ; check: ushr_imm v4, 63 + ; check: iadd v4, v5 + ; check: copy v6 + return v1 +} + +; simple case w/ zero shift (mul, add-sign-bit) +function %t_sdiv64_n6(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -6 + ; check: iconst.i64 0xd555_5555_5555_5555 + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 63 + ; check: iadd v3, v4 + ; check: copy v5 + return v1 +} + +; simple case w/ zero shift (mul, add-sign-bit) +function %t_sdiv64_n5(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -5 + ; check: iconst.i64 0x9999_9999_9999_9999 + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 1 + ; check: ushr_imm v4, 63 + ; check: iadd v4, v5 + ; check: copy v6 + return v1 +} + +; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit) +function %t_sdiv64_n3(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -3 + ; check: iconst.i64 0x5555_5555_5555_5555 + ; check: smulhi v0, v2 + ; check: isub v3, v0 + ; check: sshr_imm v4, 1 + ; check: ushr_imm v5, 63 + ; check: iadd v5, v6 + ; check: copy v7 + return v1 +} + +; simple case w/ zero shift (mul, add-sign-bit) +function %t_sdiv64_p6(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 6 + ; check: iconst.i64 0x2aaa_aaaa_aaaa_aaab + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 63 + ; check: iadd v3, v4 + ; check: copy v5 + return v1 +} + +; case d > 0 && M < 0 (mul, add, shift, add-sign-bit) +function %t_sdiv64_p15(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 15 + ; check: iconst.i64 0x8888_8888_8888_8889 + ; check: smulhi v0, v2 + ; check: iadd v3, v0 + ; check: sshr_imm v4, 3 + ; check: ushr_imm v5, 63 + ; check: iadd v5, v6 + ; check: copy v7 + return v1 +} + +; simple case (mul, shift, add-sign-bit) +function %t_sdiv64_p625(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 625 + ; check: iconst.i64 0x346d_c5d6_3886_594b + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 7 + ; check: ushr_imm v4, 63 + ; check: iadd v4, v5 + ; check: copy v6 + return v1 +} diff --git a/cranelift/filetests/preopt/div_by_const_power_of_2.cton b/cranelift/filetests/preopt/div_by_const_power_of_2.cton new file mode 100644 index 0000000000..dc51c5395d --- /dev/null +++ b/cranelift/filetests/preopt/div_by_const_power_of_2.cton @@ -0,0 +1,293 @@ + +test preopt +isa intel baseline + +; -------- U32 -------- + +; ignored +function %t_udiv32_p0(i32) -> i32 { +ebb0(v0: i32): + v1 = udiv_imm v0, 0 + ; check: udiv_imm v0, 0 + return v1 +} + +; converted to a copy +function %t_udiv32_p1(i32) -> i32 { +ebb0(v0: i32): + v1 = udiv_imm v0, 1 + ; 
check: copy v0 + return v1 +} + +; shift +function %t_udiv32_p2(i32) -> i32 { +ebb0(v0: i32): + v1 = udiv_imm v0, 2 + ; check: ushr_imm v0, 1 + return v1 +} + +; shift +function %t_udiv32_p2p31(i32) -> i32 { +ebb0(v0: i32): + v1 = udiv_imm v0, 0x8000_0000 + ; check: ushr_imm v0, 31 + return v1 +} + + +; -------- U64 -------- + +; ignored +function %t_udiv64_p0(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 0 + ; check: udiv_imm v0, 0 + return v1 +} + +; converted to a copy +function %t_udiv64_p1(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 1 + ; check: copy v0 + return v1 +} + +; shift +function %t_udiv64_p2(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 2 + ; check: ushr_imm v0, 1 + return v1 +} + +; shift +function %t_udiv64_p2p63(i64) -> i64 { +ebb0(v0: i64): + v1 = udiv_imm v0, 0x8000_0000_0000_0000 + ; check: ushr_imm v0, 63 + return v1 +} + + +; -------- S32 -------- + +; ignored +function %t_sdiv32_p0(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 0 + ; check: sdiv_imm v0, 0 + return v1 +} + +; converted to a copy +function %t_sdiv32_p1(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 1 + ; check: copy v0 + return v1 +} + +; ignored +function %t_sdiv32_n1(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -1 + ; check: sdiv_imm v0, -1 + return v1 +} + +; shift +function %t_sdiv32_p2(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 2 + ; check: ushr_imm v0, 31 + ; check: iadd v0, v2 + ; check: sshr_imm v3, 1 + ; check: copy v4 + return v1 +} + +; shift +function %t_sdiv32_n2(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -2 + ; check: ushr_imm v0, 31 + ; check: iadd v0, v2 + ; check: sshr_imm v3, 1 + ; check: irsub_imm v4, 0 + return v1 +} + +; shift +function %t_sdiv32_p4(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 4 + ; check: v2 = sshr_imm v0, 1 + ; check: ushr_imm v2, 30 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 2 + ; check: copy v5 + + return v1 +} + +; shift +function %t_sdiv32_n4(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -4 + ; check: sshr_imm v0, 1 + ; check: ushr_imm v2, 30 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 2 + ; check: irsub_imm v5, 0 + return v1 +} + +; shift +function %t_sdiv32_p2p30(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, 0x4000_0000 + ; check: sshr_imm v0, 29 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 30 + ; check: copy v5 + return v1 +} + +; shift +function %t_sdiv32_n2p30(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -0x4000_0000 + ; check: sshr_imm v0, 29 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 30 + ; check: irsub_imm v5, 0 + return v1 +} + +; there's no positive version of this, since -(-0x8000_0000) isn't +; representable. 
+function %t_sdiv32_n2p31(i32) -> i32 { +ebb0(v0: i32): + v1 = sdiv_imm v0, -0x8000_0000 + ; check: sshr_imm v0, 30 + ; check: ushr_imm v2, 1 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 31 + ; check: irsub_imm v5, 0 + return v1 +} + + +; -------- S64 -------- + +; ignored +function %t_sdiv64_p0(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 0 + ; check: sdiv_imm v0, 0 + return v1 +} + +; converted to a copy +function %t_sdiv64_p1(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 1 + ; check: copy v0 + return v1 +} + +; ignored +function %t_sdiv64_n1(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -1 + ; check: sdiv_imm v0, -1 + return v1 +} + +; shift +function %t_sdiv64_p2(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 2 + ; check: ushr_imm v0, 63 + ; check: iadd v0, v2 + ; check: sshr_imm v3, 1 + ; check: copy v4 + return v1 +} + +; shift +function %t_sdiv64_n2(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -2 + ; check: ushr_imm v0, 63 + ; check: iadd v0, v2 + ; check: sshr_imm v3, 1 + ; check: irsub_imm v4, 0 + return v1 +} + +; shift +function %t_sdiv64_p4(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 4 + ; check: sshr_imm v0, 1 + ; check: ushr_imm v2, 62 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 2 + ; check: copy v5 + return v1 +} + +; shift +function %t_sdiv64_n4(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -4 + ; check: sshr_imm v0, 1 + ; check: ushr_imm v2, 62 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 2 + ; check: irsub_imm v5, 0 + return v1 +} + +; shift +function %t_sdiv64_p2p62(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, 0x4000_0000_0000_0000 + ; check: sshr_imm v0, 61 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 62 + ; check: copy v5 + return v1 +} + +; shift +function %t_sdiv64_n2p62(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -0x4000_0000_0000_0000 + ; check: sshr_imm v0, 61 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 62 + ; check: irsub_imm v5, 0 + return v1 +} + +; there's no positive version of this, since -(-0x8000_0000_0000_0000) isn't +; representable. 
+function %t_sdiv64_n2p63(i64) -> i64 { +ebb0(v0: i64): + v1 = sdiv_imm v0, -0x8000_0000_0000_0000 + ; check: sshr_imm v0, 62 + ; check: ushr_imm v2, 1 + ; check: iadd v0, v3 + ; check: sshr_imm v4, 63 + ; check: irsub_imm v5, 0 + return v1 +} diff --git a/cranelift/filetests/preopt/rem_by_const_non_power_of_2.cton b/cranelift/filetests/preopt/rem_by_const_non_power_of_2.cton new file mode 100644 index 0000000000..c142a16359 --- /dev/null +++ b/cranelift/filetests/preopt/rem_by_const_non_power_of_2.cton @@ -0,0 +1,286 @@ + +test preopt +isa intel baseline + +; -------- U32 -------- + +; complex case (mul, sub, shift, add, shift) +function %t_urem32_p7(i32) -> i32 { +ebb0(v0: i32): + v1 = urem_imm v0, 7 + ; check: iconst.i32 0x2492_4925 + ; check: umulhi v0, v2 + ; check: isub v0, v3 + ; check: ushr_imm v4, 1 + ; check: iadd v5, v3 + ; check: ushr_imm v6, 2 + ; check: imul_imm v7, 7 + ; check: isub v0, v8 + return v1 +} + +; simple case (mul, shift) +function %t_urem32_p125(i32) -> i32 { +ebb0(v0: i32): + v1 = urem_imm v0, 125 + ; check: iconst.i32 0x1062_4dd3 + ; check: umulhi v0, v2 + ; check: ushr_imm v3, 3 + ; check: imul_imm v4, 125 + ; check: isub v0, v5 + return v1 +} + +; simple case w/ shift by zero (mul) +function %t_urem32_p641(i32) -> i32 { +ebb0(v0: i32): + v1 = urem_imm v0, 641 + ; check: iconst.i32 0x0066_3d81 + ; check: umulhi v0, v2 + ; check: imul_imm v3, 641 + ; check: isub v0, v4 + return v1 +} + + +; -------- S32 -------- + +; simple case w/ shift by zero (mul, add-sign-bit) +function %t_srem32_n6(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -6 + ; check: iconst.i32 0xffff_ffff_d555_5555 + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 31 + ; check: iadd v3, v4 + ; check: imul_imm v5, -6 + ; check: isub v0, v6 + return v1 +} + +; simple case (mul, shift, add-sign-bit) +function %t_srem32_n5(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -5 + ; check: iconst.i32 0xffff_ffff_9999_9999 + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 1 + ; check: ushr_imm v4, 31 + ; check: iadd v4, v5 + ; check: imul_imm v6, -5 + ; check: isub v0, v7 + return v1 +} + +; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit) +function %t_srem32_n3(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -3 + ; check: iconst.i32 0x5555_5555 + ; check: smulhi v0, v2 + ; check: isub v3, v0 + ; check: sshr_imm v4, 1 + ; check: ushr_imm v5, 31 + ; check: iadd v5, v6 + ; check: imul_imm v7, -3 + ; check: isub v0, v8 + return v1 +} + +; simple case w/ shift by zero (mul, add-sign-bit) +function %t_srem32_p6(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 6 + ; check: iconst.i32 0x2aaa_aaab + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 31 + ; check: iadd v3, v4 + ; check: imul_imm v5, 6 + ; check: isub v0, v6 + return v1 +} + +; case d > 0 && M < 0 (mull, add, shift, add-sign-bit) +function %t_srem32_p7(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 7 + ; check: iconst.i32 0xffff_ffff_9249_2493 + ; check: smulhi v0, v2 + ; check: iadd v3, v0 + ; check: sshr_imm v4, 2 + ; check: ushr_imm v5, 31 + ; check: iadd v5, v6 + ; check: imul_imm v7, 7 + ; check: isub v0, v8 + return v1 +} + +; simple case (mul, shift, add-sign-bit) +function %t_srem32_p625(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 625 + ; check: iconst.i32 0x68db_8bad + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 8 + ; check: ushr_imm v4, 31 + ; check: iadd v4, v5 + ; check: imul_imm v6, 625 + ; check: isub v0, v7 + return v1 +} + + +; -------- U64 -------- + +; complex case (mul, sub, shift, add, shift) +function 
%t_urem64_p7(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 7 + ; check: umulhi v0, v2 + ; check: isub v0, v3 + ; check: ushr_imm v4, 1 + ; check: iadd v5, v3 + ; check: ushr_imm v6, 2 + ; check: imul_imm v7, 7 + ; check: isub v0, v8 + return v1 +} + +; simple case (mul, shift) +function %t_urem64_p9(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 9 + ; check: iconst.i64 0xe38e_38e3_8e38_e38f + ; check: umulhi v0, v2 + ; check: ushr_imm v3, 3 + ; check: imul_imm v4, 9 + ; check: isub v0, v5 + return v1 +} + +; complex case (mul, sub, shift, add, shift) +function %t_urem64_p125(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 125 + ; check: iconst.i64 0x0624_dd2f_1a9f_be77 + ; check: umulhi v0, v2 + ; check: isub v0, v3 + ; check: ushr_imm v4, 1 + ; check: iadd v5, v3 + ; check: ushr_imm v6, 6 + ; check: imul_imm v7, 125 + ; check: isub v0, v8 + return v1 +} + +; simple case w/ shift by zero (mul) +function %t_urem64_p274177(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 274177 + ; check: iconst.i64 0x3d30_f19c_d101 + ; check: umulhi v0, v2 + ; check: imul_imm v3, 0x0004_2f01 + ; check: isub v0, v4 + return v1 +} + + +; -------- S64 -------- + +; simple case (mul, shift, add-sign-bit) +function %t_srem64_n625(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -625 + ; check: iconst.i64 0xcb92_3a29_c779_a6b5 + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 7 + ; check: ushr_imm v4, 63 + ; check: iadd v4, v5 + ; check: imul_imm v6, -625 + ; check: isub v0, v7 + return v1 +} + +; simple case w/ zero shift (mul, add-sign-bit) +function %t_srem64_n6(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -6 + ; check: iconst.i64 0xd555_5555_5555_5555 + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 63 + ; check: iadd v3, v4 + ; check: imul_imm v5, -6 + ; check: isub v0, v6 + return v1 +} + +; simple case w/ zero shift (mul, add-sign-bit) +function %t_srem64_n5(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -5 + ; check: iconst.i64 0x9999_9999_9999_9999 + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 1 + ; check: ushr_imm v4, 63 + ; check: iadd v4, v5 + ; check: imul_imm v6, -5 + ; check: isub v0, v7 + return v1 +} + +; case d < 0 && M > 0 (mul, sub, shift, add-sign-bit) +function %t_srem64_n3(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -3 + ; check: iconst.i64 0x5555_5555_5555_5555 + ; check: smulhi v0, v2 + ; check: isub v3, v0 + ; check: sshr_imm v4, 1 + ; check: ushr_imm v5, 63 + ; check: iadd v5, v6 + ; check: imul_imm v7, -3 + ; check: isub v0, v8 + return v1 +} + +; simple case w/ zero shift (mul, add-sign-bit) +function %t_srem64_p6(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 6 + ; check: iconst.i64 0x2aaa_aaaa_aaaa_aaab + ; check: smulhi v0, v2 + ; check: ushr_imm v3, 63 + ; check: iadd v3, v4 + ; check: imul_imm v5, 6 + ; check: isub v0, v6 + return v1 +} + +; case d > 0 && M < 0 (mul, add, shift, add-sign-bit) +function %t_srem64_p15(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 15 + ; check: iconst.i64 0x8888_8888_8888_8889 + ; check: smulhi v0, v2 + ; check: iadd v3, v0 + ; check: sshr_imm v4, 3 + ; check: ushr_imm v5, 63 + ; check: iadd v5, v6 + ; check: imul_imm v7, 15 + ; check: isub v0, v8 + return v1 +} + +; simple case (mul, shift, add-sign-bit) +function %t_srem64_p625(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 625 + ; check: iconst.i64 0x346d_c5d6_3886_594b + ; check: smulhi v0, v2 + ; check: sshr_imm v3, 7 + ; check: ushr_imm v4, 63 + ; check: iadd v4, v5 + ; check: imul_imm v6, 625 + ; check: isub v0, v7 + return v1 +} diff --git 
a/cranelift/filetests/preopt/rem_by_const_power_of_2.cton b/cranelift/filetests/preopt/rem_by_const_power_of_2.cton new file mode 100644 index 0000000000..931623d2e7 --- /dev/null +++ b/cranelift/filetests/preopt/rem_by_const_power_of_2.cton @@ -0,0 +1,292 @@ + +test preopt +isa intel baseline + +; -------- U32 -------- + +; ignored +function %t_urem32_p0(i32) -> i32 { +ebb0(v0: i32): + v1 = urem_imm v0, 0 + ; check: urem_imm v0, 0 + return v1 +} + +; converted to constant zero +function %t_urem32_p1(i32) -> i32 { +ebb0(v0: i32): + v1 = urem_imm v0, 1 + ; check: iconst.i32 0 + return v1 +} + +; shift +function %t_urem32_p2(i32) -> i32 { +ebb0(v0: i32): + v1 = urem_imm v0, 2 + ; check: band_imm v0, 1 + return v1 +} + +; shift +function %t_urem32_p2p31(i32) -> i32 { +ebb0(v0: i32): + v1 = urem_imm v0, 0x8000_0000 + ; check: band_imm v0, 0x7fff_ffff + return v1 +} + + +; -------- U64 -------- + +; ignored +function %t_urem64_p0(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 0 + ; check: urem_imm v0, 0 + return v1 +} + +; converted to constant zero +function %t_urem64_p1(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 1 + ; check: iconst.i64 0 + return v1 +} + +; shift +function %t_urem64_p2(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 2 + ; check: band_imm v0, 1 + return v1 +} + +; shift +function %t_urem64_p2p63(i64) -> i64 { +ebb0(v0: i64): + v1 = urem_imm v0, 0x8000_0000_0000_0000 + ; check: band_imm v0, 0x7fff_ffff_ffff_ffff + return v1 +} + + +; -------- S32 -------- + +; ignored +function %t_srem32_n1(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -1 + ; check: srem_imm v0, -1 + return v1 +} + +; ignored +function %t_srem32_p0(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 0 + ; check: srem_imm v0, 0 + return v1 +} + +; converted to constant zero +function %t_srem32_p1(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 1 + ; check: iconst.i32 0 + return v1 +} + +; shift +function %t_srem32_p2(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 2 + ; check: ushr_imm v0, 31 + ; check: iadd v0, v2 + ; check: band_imm v3, -2 + ; check: isub v0, v4 + return v1 +} + +; shift +function %t_srem32_n2(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -2 + ; check: ushr_imm v0, 31 + ; check: iadd v0, v2 + ; check: band_imm v3, -2 + ; check: isub v0, v4 + return v1 +} + +; shift +function %t_srem32_p4(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 4 + ; check: sshr_imm v0, 1 + ; check: ushr_imm v2, 30 + ; check: iadd v0, v3 + ; check: band_imm v4, -4 + ; check: isub v0, v5 + return v1 +} + +; shift +function %t_srem32_n4(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -4 + ; check: sshr_imm v0, 1 + ; check: ushr_imm v2, 30 + ; check: iadd v0, v3 + ; check: band_imm v4, -4 + ; check: isub v0, v5 + return v1 +} + +; shift +function %t_srem32_p2p30(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, 0x4000_0000 + ; check: sshr_imm v0, 29 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: band_imm v4, 0xffff_ffff_c000_0000 + ; check: isub v0, v5 + return v1 +} + +; shift +function %t_srem32_n2p30(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -0x4000_0000 + ; check: sshr_imm v0, 29 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: band_imm v4, 0xffff_ffff_c000_0000 + ; check: isub v0, v5 + return v1 +} + +; there's no positive version of this, since -(-0x8000_0000) isn't +; representable. 
+function %t_srem32_n2p31(i32) -> i32 { +ebb0(v0: i32): + v1 = srem_imm v0, -0x8000_0000 + ; check: sshr_imm v0, 30 + ; check: ushr_imm v2, 1 + ; check: iadd v0, v3 + ; check: band_imm v4, 0xffff_ffff_8000_0000 + ; check: isub v0, v5 + return v1 +} + + +; -------- S64 -------- + +; ignored +function %t_srem64_n1(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -1 + ; check: srem_imm v0, -1 + return v1 +} + +; ignored +function %t_srem64_p0(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 0 + ; check: srem_imm v0, 0 + return v1 +} + +; converted to constant zero +function %t_srem64_p1(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 1 + ; check: iconst.i64 0 + return v1 +} + +; shift +function %t_srem64_p2(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 2 + ; check: ushr_imm v0, 63 + ; check: iadd v0, v2 + ; check: band_imm v3, -2 + ; check: isub v0, v4 + return v1 +} + +; shift +function %t_srem64_n2(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -2 + ; check: ushr_imm v0, 63 + ; check: iadd v0, v2 + ; check: band_imm v3, -2 + ; check: isub v0, v4 + return v1 +} + +; shift +function %t_srem64_p4(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 4 + ; check: sshr_imm v0, 1 + ; check: ushr_imm v2, 62 + ; check: iadd v0, v3 + ; check: band_imm v4, -4 + ; check: isub v0, v5 + return v1 +} + +; shift +function %t_srem64_n4(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -4 + ; check: sshr_imm v0, 1 + ; check: ushr_imm v2, 62 + ; check: iadd v0, v3 + ; check: band_imm v4, -4 + ; check: isub v0, v5 + return v1 +} + +; shift +function %t_srem64_p2p62(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, 0x4000_0000_0000_0000 + ; check: sshr_imm v0, 61 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: band_imm v4, 0xc000_0000_0000_0000 + ; check: isub v0, v5 + return v1 +} + +; shift +function %t_srem64_n2p62(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -0x4000_0000_0000_0000 + ; check: sshr_imm v0, 61 + ; check: ushr_imm v2, 2 + ; check: iadd v0, v3 + ; check: band_imm v4, 0xc000_0000_0000_0000 + ; check: isub v0, v5 + return v1 +} + +; there's no positive version of this, since -(-0x8000_0000_0000_0000) isn't +; representable. +function %t_srem64_n2p63(i64) -> i64 { +ebb0(v0: i64): + v1 = srem_imm v0, -0x8000_0000_0000_0000 + ; check: sshr_imm v0, 62 + ; check: ushr_imm v2, 1 + ; check: iadd v0, v3 + ; check: band_imm v4, 0x8000_0000_0000_0000 + ; check: isub v0, v5 + return v1 +} diff --git a/cranelift/src/filetest/mod.rs b/cranelift/src/filetest/mod.rs index 2d2a9c6cac..5286307811 100644 --- a/cranelift/src/filetest/mod.rs +++ b/cranelift/src/filetest/mod.rs @@ -19,6 +19,7 @@ mod concurrent; mod domtree; mod legalizer; mod licm; +mod preopt; mod regalloc; mod runner; mod runone; @@ -64,6 +65,7 @@ fn new_subtest(parsed: &TestCommand) -> subtest::Result> { "domtree" => domtree::subtest(parsed), "legalizer" => legalizer::subtest(parsed), "licm" => licm::subtest(parsed), + "preopt" => preopt::subtest(parsed), "print-cfg" => print_cfg::subtest(parsed), "regalloc" => regalloc::subtest(parsed), "simple-gvn" => simple_gvn::subtest(parsed), diff --git a/cranelift/src/filetest/preopt.rs b/cranelift/src/filetest/preopt.rs new file mode 100644 index 0000000000..60d03f8207 --- /dev/null +++ b/cranelift/src/filetest/preopt.rs @@ -0,0 +1,50 @@ +//! Test command for testing the preopt pass. +//! +//! The resulting function is sent to `filecheck`. 
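+//!
+//! As an illustration (mirroring the `.cton` files added under
+//! `filetests/preopt` in this change), such a test starts with
+//!
+//!     test preopt
+//!     isa intel baseline
+//!
+//! and then lists functions whose rewritten bodies are matched against
+//! `; check:` directives.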
+ +use cretonne::ir::Function; +use cretonne; +use cton_reader::TestCommand; +use filetest::subtest::{SubTest, Context, Result, run_filecheck}; +use std::borrow::Cow; +use std::fmt::Write; +use utils::pretty_error; + +struct TestPreopt; + +pub fn subtest(parsed: &TestCommand) -> Result> { + assert_eq!(parsed.command, "preopt"); + if !parsed.options.is_empty() { + Err(format!("No options allowed on {}", parsed)) + } else { + Ok(Box::new(TestPreopt)) + } +} + +impl SubTest for TestPreopt { + fn name(&self) -> Cow { + Cow::from("preopt") + } + + fn is_mutating(&self) -> bool { + true + } + + fn run(&self, func: Cow, context: &Context) -> Result<()> { + // Create a compilation context, and drop in the function. + let mut comp_ctx = cretonne::Context::new(); + comp_ctx.func = func.into_owned(); + let isa = context.isa.expect("preopt needs an ISA"); + + comp_ctx.flowgraph(); + comp_ctx.preopt(isa).map_err(|e| { + pretty_error(&comp_ctx.func, context.isa, Into::into(e)) + })?; + + let mut text = String::new(); + write!(&mut text, "{}", &comp_ctx.func).map_err( + |e| e.to_string(), + )?; + run_filecheck(&text, context) + } +} diff --git a/lib/cretonne/meta/base/instructions.py b/lib/cretonne/meta/base/instructions.py index 7200e45e7d..7db8bed202 100644 --- a/lib/cretonne/meta/base/instructions.py +++ b/lib/cretonne/meta/base/instructions.py @@ -833,6 +833,26 @@ imul = Instruction( """, ins=(x, y), outs=a) +umulhi = Instruction( + 'umulhi', r""" + Unsigned integer multiplication, producing the high half of a + double-length result. + + Polymorphic over all scalar integer types, but does not support vector + types. + """, + ins=(x, y), outs=a) + +smulhi = Instruction( + 'smulhi', """ + Signed integer multiplication, producing the high half of a + double-length result. + + Polymorphic over all scalar integer types, but does not support vector + types. + """, + ins=(x, y), outs=a) + udiv = Instruction( 'udiv', r""" Unsigned integer division: :math:`a := \lfloor {x \over y} \rfloor`. diff --git a/lib/cretonne/meta/isa/intel/encodings.py b/lib/cretonne/meta/isa/intel/encodings.py index 1f47a5da06..f9b77a04bd 100644 --- a/lib/cretonne/meta/isa/intel/encodings.py +++ b/lib/cretonne/meta/isa/intel/encodings.py @@ -120,6 +120,9 @@ enc_i32_i64(base.imul, r.rrx, 0x0f, 0xaf) enc_i32_i64(x86.sdivmodx, r.div, 0xf7, rrr=7) enc_i32_i64(x86.udivmodx, r.div, 0xf7, rrr=6) +enc_i32_i64(x86.smulx, r.mulx, 0xf7, rrr=5) +enc_i32_i64(x86.umulx, r.mulx, 0xf7, rrr=4) + enc_i32_i64(base.copy, r.umr, 0x89) enc_both(base.copy.b1, r.umr, 0x89) enc_i32_i64(base.regmove, r.rmov, 0x89) diff --git a/lib/cretonne/meta/isa/intel/instructions.py b/lib/cretonne/meta/isa/intel/instructions.py index 277cf62b4a..e5265e2d15 100644 --- a/lib/cretonne/meta/isa/intel/instructions.py +++ b/lib/cretonne/meta/isa/intel/instructions.py @@ -47,6 +47,28 @@ sdivmodx = Instruction( """, ins=(nlo, nhi, d), outs=(q, r), can_trap=True) +argL = Operand('argL', iWord) +argR = Operand('argR', iWord) +resLo = Operand('resLo', iWord) +resHi = Operand('resHi', iWord) + +umulx = Instruction( + 'x86_umulx', r""" + Unsigned integer multiplication, producing a double-length result. + + Polymorphic over all scalar integer types, but does not support vector + types. + """, + ins=(argL, argR), outs=(resLo, resHi)) + +smulx = Instruction( + 'x86_smulx', r""" + Signed integer multiplication, producing a double-length result. + + Polymorphic over all scalar integer types, but does not support vector + types. 
+ """, + ins=(argL, argR), outs=(resLo, resHi)) Float = TypeVar( 'Float', 'A scalar or vector floating point number', diff --git a/lib/cretonne/meta/isa/intel/legalize.py b/lib/cretonne/meta/isa/intel/legalize.py index 32f0a98153..5806bb9284 100644 --- a/lib/cretonne/meta/isa/intel/legalize.py +++ b/lib/cretonne/meta/isa/intel/legalize.py @@ -37,6 +37,23 @@ intel_expand.custom_legalize(insts.srem, 'expand_sdivrem') intel_expand.custom_legalize(insts.udiv, 'expand_udivrem') intel_expand.custom_legalize(insts.urem, 'expand_udivrem') +# +# Double length (widening) multiplication +# +resLo = Var('resLo') +resHi = Var('resHi') +intel_expand.legalize( + resHi << insts.umulhi(x, y), + Rtl( + (resLo, resHi) << x86.umulx(x, y) + )) + +intel_expand.legalize( + resHi << insts.smulhi(x, y), + Rtl( + (resLo, resHi) << x86.smulx(x, y) + )) + # Floating point condition codes. # # The 8 condition codes in `supported_floatccs` are directly supported by a diff --git a/lib/cretonne/meta/isa/intel/recipes.py b/lib/cretonne/meta/isa/intel/recipes.py index 133837fc78..9d03d02053 100644 --- a/lib/cretonne/meta/isa/intel/recipes.py +++ b/lib/cretonne/meta/isa/intel/recipes.py @@ -453,6 +453,15 @@ div = TailRecipe( modrm_r_bits(in_reg2, bits, sink); ''') +# XX /n for {s,u}mulx: inputs in %rax, r. Outputs in %rdx(hi):%rax(lo) +mulx = TailRecipe( + 'mulx', Binary, size=1, + ins=(GPR.rax, GPR), outs=(GPR.rax, GPR.rdx), + emit=''' + PUT_OP(bits, rex1(in_reg1), sink); + modrm_r_bits(in_reg1, bits, sink); + ''') + # XX /n ib with 8-bit immediate sign-extended. rib = TailRecipe( 'rib', BinaryImm, size=2, ins=GPR, outs=0, diff --git a/lib/cretonne/src/context.rs b/lib/cretonne/src/context.rs index 8aff5aa706..78b5d5bebf 100644 --- a/lib/cretonne/src/context.rs +++ b/lib/cretonne/src/context.rs @@ -23,6 +23,7 @@ use unreachable_code::eliminate_unreachable_code; use verifier; use simple_gvn::do_simple_gvn; use licm::do_licm; +use preopt::do_preopt; use timing; /// Persistent data structures and compilation pipeline. @@ -87,6 +88,7 @@ impl Context { self.verify_if(isa)?; self.compute_cfg(); + self.preopt(isa)?; self.legalize(isa)?; /* TODO: Enable additional optimization passes. if isa.flags().opt_level() == OptLevel::Best { @@ -131,6 +133,13 @@ impl Context { } } + /// Perform pre-legalization rewrites on the function. + pub fn preopt(&mut self, isa: &TargetIsa) -> CtonResult { + do_preopt(&mut self.func); + self.verify_if(isa)?; + Ok(()) + } + /// Run the legalizer for `isa` on the function. pub fn legalize(&mut self, isa: &TargetIsa) -> CtonResult { // Legalization invalidates the domtree and loop_analysis by mutating the CFG. diff --git a/lib/cretonne/src/divconst_magic_numbers.rs b/lib/cretonne/src/divconst_magic_numbers.rs new file mode 100644 index 0000000000..64416e48fe --- /dev/null +++ b/lib/cretonne/src/divconst_magic_numbers.rs @@ -0,0 +1,542 @@ +//! Compute "magic numbers" for division-by-constants transformations. + +#![allow(non_snake_case)] + +//---------------------------------------------------------------------- +// +// Math helpers for division by (non-power-of-2) constants. This is based +// on the presentation in "Hacker's Delight" by Henry Warren, 2003. There +// are four cases: {unsigned, signed} x {32 bit, 64 bit}. The word size +// makes little difference, but the signed-vs-unsigned aspect has a large +// effect. Therefore everything is presented in the order U32 U64 S32 S64 +// so as to emphasise the similarity of the U32 and U64 cases and the S32 +// and S64 cases. 
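+//
+// As a worked example (taken from the unit tests at the bottom of this file
+// and the `filetests/preopt` cases): magicU32(7) produces
+// mulBy = 0x2492_4925, doAdd = true, shiftBy = 3, which encodes the
+// identity, for any u32 value n,
+//
+//     q   = umulhi(n, 0x2492_4925)
+//     n/7 = (((n - q) >> 1) + q) >> (3 - 1)
+//
+// i.e. exactly the add-and-shift sequence the preopt pass emits for a
+// divisor of 7.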
+ +// Structures to hold the "magic numbers" computed. + +#[derive(PartialEq, Debug)] +pub struct MU32 { + pub mulBy: u32, + pub doAdd: bool, + pub shiftBy: i32, +} + +#[derive(PartialEq, Debug)] +pub struct MU64 { + pub mulBy: u64, + pub doAdd: bool, + pub shiftBy: i32, +} + +#[derive(PartialEq, Debug)] +pub struct MS32 { + pub mulBy: i32, + pub shiftBy: i32, +} + +#[derive(PartialEq, Debug)] +pub struct MS64 { + pub mulBy: i64, + pub shiftBy: i32, +} + +// The actual "magic number" generators follow. + +pub fn magicU32(d: u32) -> MU32 { + assert_ne!(d, 0); + assert_ne!(d, 1); // d==1 generates out of range shifts. + + let mut do_add: bool = false; + let mut p: i32 = 31; + let nc: u32 = 0xFFFFFFFFu32 - u32::wrapping_neg(d) % d; + let mut q1: u32 = 0x80000000u32 / nc; + let mut r1: u32 = 0x80000000u32 - q1 * nc; + let mut q2: u32 = 0x7FFFFFFFu32 / d; + let mut r2: u32 = 0x7FFFFFFFu32 - q2 * d; + loop { + p = p + 1; + if r1 >= nc - r1 { + q1 = u32::wrapping_add(u32::wrapping_mul(2, q1), 1); + r1 = u32::wrapping_sub(u32::wrapping_mul(2, r1), nc); + } else { + q1 = 2 * q1; + r1 = 2 * r1; + } + if r2 + 1 >= d - r2 { + if q2 >= 0x7FFFFFFFu32 { + do_add = true; + } + q2 = 2 * q2 + 1; + r2 = u32::wrapping_sub(u32::wrapping_add(u32::wrapping_mul(2, r2), 1), d); + } else { + if q2 >= 0x80000000u32 { + do_add = true; + } + q2 = u32::wrapping_mul(2, q2); + r2 = 2 * r2 + 1; + } + let delta: u32 = d - 1 - r2; + if !(p < 64 && (q1 < delta || (q1 == delta && r1 == 0))) { + break; + } + } + + MU32 { + mulBy: q2 + 1, + doAdd: do_add, + shiftBy: p - 32, + } +} + +pub fn magicU64(d: u64) -> MU64 { + assert_ne!(d, 0); + assert_ne!(d, 1); // d==1 generates out of range shifts. + + let mut do_add: bool = false; + let mut p: i32 = 63; + let nc: u64 = 0xFFFFFFFFFFFFFFFFu64 - u64::wrapping_neg(d) % d; + let mut q1: u64 = 0x8000000000000000u64 / nc; + let mut r1: u64 = 0x8000000000000000u64 - q1 * nc; + let mut q2: u64 = 0x7FFFFFFFFFFFFFFFu64 / d; + let mut r2: u64 = 0x7FFFFFFFFFFFFFFFu64 - q2 * d; + loop { + p = p + 1; + if r1 >= nc - r1 { + q1 = u64::wrapping_add(u64::wrapping_mul(2, q1), 1); + r1 = u64::wrapping_sub(u64::wrapping_mul(2, r1), nc); + } else { + q1 = 2 * q1; + r1 = 2 * r1; + } + if r2 + 1 >= d - r2 { + if q2 >= 0x7FFFFFFFFFFFFFFFu64 { + do_add = true; + } + q2 = 2 * q2 + 1; + r2 = u64::wrapping_sub(u64::wrapping_add(u64::wrapping_mul(2, r2), 1), d); + } else { + if q2 >= 0x8000000000000000u64 { + do_add = true; + } + q2 = u64::wrapping_mul(2, q2); + r2 = 2 * r2 + 1; + } + let delta: u64 = d - 1 - r2; + if !(p < 128 && (q1 < delta || (q1 == delta && r1 == 0))) { + break; + } + } + + MU64 { + mulBy: q2 + 1, + doAdd: do_add, + shiftBy: p - 64, + } +} + +pub fn magicS32(d: i32) -> MS32 { + assert_ne!(d, -1); + assert_ne!(d, 0); + assert_ne!(d, 1); + let two31: u32 = 0x80000000u32; + let mut p: i32 = 31; + let ad: u32 = i32::wrapping_abs(d) as u32; + let t: u32 = two31 + ((d as u32) >> 31); + let anc: u32 = u32::wrapping_sub(t - 1, t % ad); + let mut q1: u32 = two31 / anc; + let mut r1: u32 = two31 - q1 * anc; + let mut q2: u32 = two31 / ad; + let mut r2: u32 = two31 - q2 * ad; + loop { + p = p + 1; + q1 = 2 * q1; + r1 = 2 * r1; + if r1 >= anc { + q1 = q1 + 1; + r1 = r1 - anc; + } + q2 = 2 * q2; + r2 = 2 * r2; + if r2 >= ad { + q2 = q2 + 1; + r2 = r2 - ad; + } + let delta: u32 = ad - r2; + if !(q1 < delta || (q1 == delta && r1 == 0)) { + break; + } + } + + MS32 { + mulBy: (if d < 0 { + u32::wrapping_neg(q2 + 1) + } else { + q2 + 1 + }) as i32, + shiftBy: p - 32, + } +} + +pub fn magicS64(d: i64) -> MS64 
{ + assert_ne!(d, -1); + assert_ne!(d, 0); + assert_ne!(d, 1); + let two63: u64 = 0x8000000000000000u64; + let mut p: i32 = 63; + let ad: u64 = i64::wrapping_abs(d) as u64; + let t: u64 = two63 + ((d as u64) >> 63); + let anc: u64 = u64::wrapping_sub(t - 1, t % ad); + let mut q1: u64 = two63 / anc; + let mut r1: u64 = two63 - q1 * anc; + let mut q2: u64 = two63 / ad; + let mut r2: u64 = two63 - q2 * ad; + loop { + p = p + 1; + q1 = 2 * q1; + r1 = 2 * r1; + if r1 >= anc { + q1 = q1 + 1; + r1 = r1 - anc; + } + q2 = 2 * q2; + r2 = 2 * r2; + if r2 >= ad { + q2 = q2 + 1; + r2 = r2 - ad; + } + let delta: u64 = ad - r2; + if !(q1 < delta || (q1 == delta && r1 == 0)) { + break; + } + } + + MS64 { + mulBy: (if d < 0 { + u64::wrapping_neg(q2 + 1) + } else { + q2 + 1 + }) as i64, + shiftBy: p - 64, + } +} + +#[cfg(test)] +mod tests { + use super::{magicU32, magicU64, magicS32, magicS64}; + use super::{MU32, MU64, MS32, MS64}; + + fn mkMU32(mulBy: u32, doAdd: bool, shiftBy: i32) -> MU32 { + MU32 { + mulBy, + doAdd, + shiftBy, + } + } + + fn mkMU64(mulBy: u64, doAdd: bool, shiftBy: i32) -> MU64 { + MU64 { + mulBy, + doAdd, + shiftBy, + } + } + + fn mkMS32(mulBy: i32, shiftBy: i32) -> MS32 { + MS32 { mulBy, shiftBy } + } + + fn mkMS64(mulBy: i64, shiftBy: i32) -> MS64 { + MS64 { mulBy, shiftBy } + } + + #[test] + fn test_magicU32() { + assert_eq!(magicU32(2u32), mkMU32(0x80000000u32, false, 0)); + assert_eq!(magicU32(3u32), mkMU32(0xaaaaaaabu32, false, 1)); + assert_eq!(magicU32(4u32), mkMU32(0x40000000u32, false, 0)); + assert_eq!(magicU32(5u32), mkMU32(0xcccccccdu32, false, 2)); + assert_eq!(magicU32(6u32), mkMU32(0xaaaaaaabu32, false, 2)); + assert_eq!(magicU32(7u32), mkMU32(0x24924925u32, true, 3)); + assert_eq!(magicU32(9u32), mkMU32(0x38e38e39u32, false, 1)); + assert_eq!(magicU32(10u32), mkMU32(0xcccccccdu32, false, 3)); + assert_eq!(magicU32(11u32), mkMU32(0xba2e8ba3u32, false, 3)); + assert_eq!(magicU32(12u32), mkMU32(0xaaaaaaabu32, false, 3)); + assert_eq!(magicU32(25u32), mkMU32(0x51eb851fu32, false, 3)); + assert_eq!(magicU32(125u32), mkMU32(0x10624dd3u32, false, 3)); + assert_eq!(magicU32(625u32), mkMU32(0xd1b71759u32, false, 9)); + assert_eq!(magicU32(1337u32), mkMU32(0x88233b2bu32, true, 11)); + assert_eq!(magicU32(65535u32), mkMU32(0x80008001u32, false, 15)); + assert_eq!(magicU32(65536u32), mkMU32(0x00010000u32, false, 0)); + assert_eq!(magicU32(65537u32), mkMU32(0xffff0001u32, false, 16)); + assert_eq!(magicU32(31415927u32), mkMU32(0x445b4553u32, false, 23)); + assert_eq!(magicU32(0xdeadbeefu32), mkMU32(0x93275ab3u32, false, 31)); + assert_eq!(magicU32(0xfffffffdu32), mkMU32(0x40000001u32, false, 30)); + assert_eq!(magicU32(0xfffffffeu32), mkMU32(0x00000003u32, true, 32)); + assert_eq!(magicU32(0xffffffffu32), mkMU32(0x80000001u32, false, 31)); + } + #[test] + fn test_magicU64() { + assert_eq!(magicU64(2u64), mkMU64(0x8000000000000000u64, false, 0)); + assert_eq!(magicU64(3u64), mkMU64(0xaaaaaaaaaaaaaaabu64, false, 1)); + assert_eq!(magicU64(4u64), mkMU64(0x4000000000000000u64, false, 0)); + assert_eq!(magicU64(5u64), mkMU64(0xcccccccccccccccdu64, false, 2)); + assert_eq!(magicU64(6u64), mkMU64(0xaaaaaaaaaaaaaaabu64, false, 2)); + assert_eq!(magicU64(7u64), mkMU64(0x2492492492492493u64, true, 3)); + assert_eq!(magicU64(9u64), mkMU64(0xe38e38e38e38e38fu64, false, 3)); + assert_eq!(magicU64(10u64), mkMU64(0xcccccccccccccccdu64, false, 3)); + assert_eq!(magicU64(11u64), mkMU64(0x2e8ba2e8ba2e8ba3u64, false, 1)); + assert_eq!(magicU64(12u64), mkMU64(0xaaaaaaaaaaaaaaabu64, false, 3)); + 
assert_eq!(magicU64(25u64), mkMU64(0x47ae147ae147ae15u64, true, 5)); + assert_eq!(magicU64(125u64), mkMU64(0x0624dd2f1a9fbe77u64, true, 7)); + assert_eq!(magicU64(625u64), mkMU64(0x346dc5d63886594bu64, false, 7)); + assert_eq!(magicU64(1337u64), mkMU64(0xc4119d952866a139u64, false, 10)); + assert_eq!( + magicU64(31415927u64), + mkMU64(0x116d154b9c3d2f85u64, true, 25) + ); + assert_eq!( + magicU64(0x00000000deadbeefu64), + mkMU64(0x93275ab2dfc9094bu64, false, 31) + ); + assert_eq!( + magicU64(0x00000000fffffffdu64), + mkMU64(0x8000000180000005u64, false, 31) + ); + assert_eq!( + magicU64(0x00000000fffffffeu64), + mkMU64(0x0000000200000005u64, true, 32) + ); + assert_eq!( + magicU64(0x00000000ffffffffu64), + mkMU64(0x8000000080000001u64, false, 31) + ); + assert_eq!( + magicU64(0x0000000100000000u64), + mkMU64(0x0000000100000000u64, false, 0) + ); + assert_eq!( + magicU64(0x0000000100000001u64), + mkMU64(0xffffffff00000001u64, false, 32) + ); + assert_eq!( + magicU64(0x0ddc0ffeebadf00du64), + mkMU64(0x2788e9d394b77da1u64, true, 60) + ); + assert_eq!( + magicU64(0xfffffffffffffffdu64), + mkMU64(0x4000000000000001u64, false, 62) + ); + assert_eq!( + magicU64(0xfffffffffffffffeu64), + mkMU64(0x0000000000000003u64, true, 64) + ); + assert_eq!( + magicU64(0xffffffffffffffffu64), + mkMU64(0x8000000000000001u64, false, 63) + ); + } + #[test] + fn test_magicS32() { + assert_eq!(magicS32(-0x80000000i32), mkMS32(0x7fffffffu32 as i32, 30)); + assert_eq!(magicS32(-0x7FFFFFFFi32), mkMS32(0xbfffffffu32 as i32, 29)); + assert_eq!(magicS32(-0x7FFFFFFEi32), mkMS32(0x7ffffffdu32 as i32, 30)); + assert_eq!(magicS32(-31415927i32), mkMS32(0xbba4baadu32 as i32, 23)); + assert_eq!(magicS32(-1337i32), mkMS32(0x9df73135u32 as i32, 9)); + assert_eq!(magicS32(-256i32), mkMS32(0x7fffffffu32 as i32, 7)); + assert_eq!(magicS32(-5i32), mkMS32(0x99999999u32 as i32, 1)); + assert_eq!(magicS32(-3i32), mkMS32(0x55555555u32 as i32, 1)); + assert_eq!(magicS32(-2i32), mkMS32(0x7fffffffu32 as i32, 0)); + assert_eq!(magicS32(2i32), mkMS32(0x80000001u32 as i32, 0)); + assert_eq!(magicS32(3i32), mkMS32(0x55555556u32 as i32, 0)); + assert_eq!(magicS32(4i32), mkMS32(0x80000001u32 as i32, 1)); + assert_eq!(magicS32(5i32), mkMS32(0x66666667u32 as i32, 1)); + assert_eq!(magicS32(6i32), mkMS32(0x2aaaaaabu32 as i32, 0)); + assert_eq!(magicS32(7i32), mkMS32(0x92492493u32 as i32, 2)); + assert_eq!(magicS32(9i32), mkMS32(0x38e38e39u32 as i32, 1)); + assert_eq!(magicS32(10i32), mkMS32(0x66666667u32 as i32, 2)); + assert_eq!(magicS32(11i32), mkMS32(0x2e8ba2e9u32 as i32, 1)); + assert_eq!(magicS32(12i32), mkMS32(0x2aaaaaabu32 as i32, 1)); + assert_eq!(magicS32(25i32), mkMS32(0x51eb851fu32 as i32, 3)); + assert_eq!(magicS32(125i32), mkMS32(0x10624dd3u32 as i32, 3)); + assert_eq!(magicS32(625i32), mkMS32(0x68db8badu32 as i32, 8)); + assert_eq!(magicS32(1337i32), mkMS32(0x6208cecbu32 as i32, 9)); + assert_eq!(magicS32(31415927i32), mkMS32(0x445b4553u32 as i32, 23)); + assert_eq!(magicS32(0x7ffffffei32), mkMS32(0x80000003u32 as i32, 30)); + assert_eq!(magicS32(0x7fffffffi32), mkMS32(0x40000001u32 as i32, 29)); + } + #[test] + fn test_magicS64() { + assert_eq!( + magicS64(-0x8000000000000000i64), + mkMS64(0x7fffffffffffffffu64 as i64, 62) + ); + assert_eq!( + magicS64(-0x7FFFFFFFFFFFFFFFi64), + mkMS64(0xbfffffffffffffffu64 as i64, 61) + ); + assert_eq!( + magicS64(-0x7FFFFFFFFFFFFFFEi64), + mkMS64(0x7ffffffffffffffdu64 as i64, 62) + ); + assert_eq!( + magicS64(-0x0ddC0ffeeBadF00di64), + mkMS64(0x6c3b8b1635a4412fu64 as i64, 59) + ); + assert_eq!( + 
magicS64(-0x100000001i64), + mkMS64(0x800000007fffffffu64 as i64, 31) + ); + assert_eq!( + magicS64(-0x100000000i64), + mkMS64(0x7fffffffffffffffu64 as i64, 31) + ); + assert_eq!( + magicS64(-0xFFFFFFFFi64), + mkMS64(0x7fffffff7fffffffu64 as i64, 31) + ); + assert_eq!( + magicS64(-0xFFFFFFFEi64), + mkMS64(0x7ffffffefffffffdu64 as i64, 31) + ); + assert_eq!( + magicS64(-0xFFFFFFFDi64), + mkMS64(0x7ffffffe7ffffffbu64 as i64, 31) + ); + assert_eq!( + magicS64(-0xDeadBeefi64), + mkMS64(0x6cd8a54d2036f6b5u64 as i64, 31) + ); + assert_eq!( + magicS64(-31415927i64), + mkMS64(0x7749755a31e1683du64 as i64, 24) + ); + assert_eq!(magicS64(-1337i64), mkMS64(0x9df731356bccaf63u64 as i64, 9)); + assert_eq!(magicS64(-256i64), mkMS64(0x7fffffffffffffffu64 as i64, 7)); + assert_eq!(magicS64(-5i64), mkMS64(0x9999999999999999u64 as i64, 1)); + assert_eq!(magicS64(-3i64), mkMS64(0x5555555555555555u64 as i64, 1)); + assert_eq!(magicS64(-2i64), mkMS64(0x7fffffffffffffffu64 as i64, 0)); + assert_eq!(magicS64(2i64), mkMS64(0x8000000000000001u64 as i64, 0)); + assert_eq!(magicS64(3i64), mkMS64(0x5555555555555556u64 as i64, 0)); + assert_eq!(magicS64(4i64), mkMS64(0x8000000000000001u64 as i64, 1)); + assert_eq!(magicS64(5i64), mkMS64(0x6666666666666667u64 as i64, 1)); + assert_eq!(magicS64(6i64), mkMS64(0x2aaaaaaaaaaaaaabu64 as i64, 0)); + assert_eq!(magicS64(7i64), mkMS64(0x4924924924924925u64 as i64, 1)); + assert_eq!(magicS64(9i64), mkMS64(0x1c71c71c71c71c72u64 as i64, 0)); + assert_eq!(magicS64(10i64), mkMS64(0x6666666666666667u64 as i64, 2)); + assert_eq!(magicS64(11i64), mkMS64(0x2e8ba2e8ba2e8ba3u64 as i64, 1)); + assert_eq!(magicS64(12i64), mkMS64(0x2aaaaaaaaaaaaaabu64 as i64, 1)); + assert_eq!(magicS64(25i64), mkMS64(0xa3d70a3d70a3d70bu64 as i64, 4)); + assert_eq!(magicS64(125i64), mkMS64(0x20c49ba5e353f7cfu64 as i64, 4)); + assert_eq!(magicS64(625i64), mkMS64(0x346dc5d63886594bu64 as i64, 7)); + assert_eq!(magicS64(1337i64), mkMS64(0x6208ceca9433509du64 as i64, 9)); + assert_eq!( + magicS64(31415927i64), + mkMS64(0x88b68aa5ce1e97c3u64 as i64, 24) + ); + assert_eq!( + magicS64(0x00000000deadbeefi64), + mkMS64(0x93275ab2dfc9094bu64 as i64, 31) + ); + assert_eq!( + magicS64(0x00000000fffffffdi64), + mkMS64(0x8000000180000005u64 as i64, 31) + ); + assert_eq!( + magicS64(0x00000000fffffffei64), + mkMS64(0x8000000100000003u64 as i64, 31) + ); + assert_eq!( + magicS64(0x00000000ffffffffi64), + mkMS64(0x8000000080000001u64 as i64, 31) + ); + assert_eq!( + magicS64(0x0000000100000000i64), + mkMS64(0x8000000000000001u64 as i64, 31) + ); + assert_eq!( + magicS64(0x0000000100000001i64), + mkMS64(0x7fffffff80000001u64 as i64, 31) + ); + assert_eq!( + magicS64(0x0ddc0ffeebadf00di64), + mkMS64(0x93c474e9ca5bbed1u64 as i64, 59) + ); + assert_eq!( + magicS64(0x7ffffffffffffffdi64), + mkMS64(0x2000000000000001u64 as i64, 60) + ); + assert_eq!( + magicS64(0x7ffffffffffffffei64), + mkMS64(0x8000000000000003u64 as i64, 62) + ); + assert_eq!( + magicS64(0x7fffffffffffffffi64), + mkMS64(0x4000000000000001u64 as i64, 61) + ); + } + #[test] + fn test_magic_generators_dont_panic() { + // The point of this is to check that the magic number generators + // don't panic with integer wraparounds, especially at boundary + // cases for their arguments. The actual results are thrown away. 
+ let mut total: u64 = 0; + println!("Testing UP magicU32"); + for x in 2..(200 * 1000u32) { + let m = magicU32(x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + total = total - (if m.doAdd { 123 } else { 456 }); + } + println!("Testing DOWN magicU32"); + for x in 0..(200 * 1000u32) { + let m = magicU32(0xFFFF_FFFFu32 - x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + total = total - (if m.doAdd { 123 } else { 456 }); + } + + println!("Testing UP magicU64"); + for x in 2..(200 * 1000u64) { + let m = magicU64(x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + total = total - (if m.doAdd { 123 } else { 456 }); + } + println!("Testing DOWN magicU64"); + for x in 0..(200 * 1000u64) { + let m = magicU64(0xFFFF_FFFF_FFFF_FFFFu64 - x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + total = total - (if m.doAdd { 123 } else { 456 }); + } + + println!("Testing UP magicS32"); + for x in 0..(200 * 1000i32) { + let m = magicS32(-0x8000_0000i32 + x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + } + println!("Testing DOWN magicS32"); + for x in 0..(200 * 1000i32) { + let m = magicS32(0x7FFF_FFFFi32 - x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + } + + println!("Testing UP magicS64"); + for x in 0..(200 * 1000i64) { + let m = magicS64(-0x8000_0000_0000_0000i64 + x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + } + println!("Testing DOWN magicS64"); + for x in 0..(200 * 1000i64) { + let m = magicS64(0x7FFF_FFFF_FFFF_FFFFi64 - x); + total = total ^ (m.mulBy as u64); + total = total + (m.shiftBy as u64); + } + // Force `total` -- and hence, the entire computation -- to + // be used, so that rustc can't optimise it out. + assert_eq!(total, 7547519887532559585u64); + } +} diff --git a/lib/cretonne/src/isa/intel/enc_tables.rs b/lib/cretonne/src/isa/intel/enc_tables.rs index 1a602cf8a1..00c5ab99cb 100644 --- a/lib/cretonne/src/isa/intel/enc_tables.rs +++ b/lib/cretonne/src/isa/intel/enc_tables.rs @@ -1,5 +1,6 @@ //! Encoding tables for Intel ISAs. +use bitset::BitSet; use cursor::{Cursor, FuncCursor}; use flowgraph::ControlFlowGraph; use ir::{self, InstBuilder}; diff --git a/lib/cretonne/src/lib.rs b/lib/cretonne/src/lib.rs index b4c386477c..f34bed638b 100644 --- a/lib/cretonne/src/lib.rs +++ b/lib/cretonne/src/lib.rs @@ -33,11 +33,13 @@ mod abi; mod bitset; mod constant_hash; mod context; +mod divconst_magic_numbers; mod iterators; mod legalizer; mod licm; mod partition_slice; mod predicates; +mod preopt; mod ref_slice; mod regalloc; mod scoped_hash_map; diff --git a/lib/cretonne/src/preopt.rs b/lib/cretonne/src/preopt.rs new file mode 100644 index 0000000000..0e4582d7a7 --- /dev/null +++ b/lib/cretonne/src/preopt.rs @@ -0,0 +1,521 @@ +//! A pre-legalization rewriting pass. + +#![allow(non_snake_case)] + +use cursor::{Cursor, FuncCursor}; +use ir::dfg::ValueDef; +use ir::{Function, InstructionData, Value, DataFlowGraph, InstBuilder, Type}; +use ir::Inst; +use ir::types::{I32, I64}; +use ir::instructions::Opcode; +use divconst_magic_numbers::{MU32, MU64, MS32, MS64}; +use divconst_magic_numbers::{magicU32, magicU64, magicS32, magicS64}; +use timing; + + +//---------------------------------------------------------------------- +// +// Pattern-match helpers and transformation for div and rem by constants. 
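+//
+// For example (taken from filetests/preopt/div_by_const_non_power_of_2.cton;
+// value numbering is illustrative), the pass rewrites
+//
+//     v1 = udiv_imm v0, 7
+//
+// into roughly
+//
+//     v2 = iconst.i32 0x2492_4925
+//     v3 = umulhi v0, v2
+//     v4 = isub v0, v3
+//     v5 = ushr_imm v4, 1
+//     v6 = iadd v5, v3
+//     v7 = ushr_imm v6, 2
+//     v1 = copy v7
+//
+// while power-of-two divisors become plain shifts and masks (see
+// div_by_const_power_of_2.cton and rem_by_const_power_of_2.cton).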
+ +// Simple math helpers + +// if `x` is a power of two, or the negation thereof, return the power along +// with a boolean that indicates whether `x` is negative. Else return None. +#[inline] +fn isPowerOf2_S32(x: i32) -> Option<(bool, u32)> { + // We have to special-case this because abs(x) isn't representable. + if x == -0x8000_0000 { + return Some((true, 31)); + } + let abs_x = i32::wrapping_abs(x) as u32; + if abs_x.is_power_of_two() { + return Some((x < 0, abs_x.trailing_zeros())); + } + None +} + +// Same comments as for isPowerOf2_S64 apply. +#[inline] +fn isPowerOf2_S64(x: i64) -> Option<(bool, u32)> { + // We have to special-case this because abs(x) isn't representable. + if x == -0x8000_0000_0000_0000 { + return Some((true, 63)); + } + let abs_x = i64::wrapping_abs(x) as u64; + if abs_x.is_power_of_two() { + return Some((x < 0, abs_x.trailing_zeros())); + } + None +} + +#[derive(Debug)] +enum DivRemByConstInfo { + DivU32(Value, u32), // In all cases, the arguments are: + DivU64(Value, u64), // left operand, right operand + DivS32(Value, i32), + DivS64(Value, i64), + RemU32(Value, u32), + RemU64(Value, u64), + RemS32(Value, i32), + RemS64(Value, i64), +} + +// Possibly create a DivRemByConstInfo from the given components, by +// figuring out which, if any, of the 8 cases apply, and also taking care to +// sanity-check the immediate. +fn package_up_divrem_info( + argL: Value, + argL_ty: Type, + argRs: i64, + isSigned: bool, + isRem: bool, +) -> Option { + let argRu: u64 = argRs as u64; + if !isSigned && argL_ty == I32 && argRu < 0x1_0000_0000 { + let con = if isRem { + DivRemByConstInfo::RemU32 + } else { + DivRemByConstInfo::DivU32 + }; + return Some(con(argL, argRu as u32)); + } + if !isSigned && argL_ty == I64 { + // unsigned 64, no range constraint + let con = if isRem { + DivRemByConstInfo::RemU64 + } else { + DivRemByConstInfo::DivU64 + }; + return Some(con(argL, argRu)); + } + if isSigned && argL_ty == I32 && (argRu <= 0x7fff_ffff || argRu >= 0xffff_ffff_8000_0000) { + let con = if isRem { + DivRemByConstInfo::RemS32 + } else { + DivRemByConstInfo::DivS32 + }; + return Some(con(argL, argRu as i32)); + } + if isSigned && argL_ty == I64 { + // signed 64, no range constraint + let con = if isRem { + DivRemByConstInfo::RemS64 + } else { + DivRemByConstInfo::DivS64 + }; + return Some(con(argL, argRu as i64)); + } + None +} + +// Examine `idata` to see if it is a div or rem by a constant, and if so +// return the operands, signedness, operation size and div-vs-rem-ness in a +// handy bundle. +fn get_div_info(inst: Inst, dfg: &DataFlowGraph) -> Option { + let idata: &InstructionData = &dfg[inst]; + + if let &InstructionData::BinaryImm { opcode, arg, imm } = idata { + let (isSigned, isRem) = match opcode { + Opcode::UdivImm => (false, false), + Opcode::UremImm => (false, true), + Opcode::SdivImm => (true, false), + Opcode::SremImm => (true, true), + _other => return None, + }; + // Pull the operation size (type) from the left arg + let argL_ty = dfg.value_type(arg); + return package_up_divrem_info(arg, argL_ty, imm.into(), isSigned, isRem); + } + + // TODO: should we actually bother to do this (that is, manually match + // the case that the second argument is an iconst)? Or should we assume + // that some previous constant propagation pass has pushed all such + // immediates to their use points, creating BinaryImm instructions + // instead? For now we take the conservative approach. 
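+    //
+    // For example (see filetests/preopt/div_by_const_indirect.cton), the pair
+    //
+    //     v1 = iconst.i32 7
+    //     v2 = udiv v0, v1
+    //
+    // is picked up here and handled the same way as `udiv_imm v0, 7`.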
+ if let &InstructionData::Binary { opcode, args } = idata { + let (isSigned, isRem) = match opcode { + Opcode::Udiv => (false, false), + Opcode::Urem => (false, true), + Opcode::Sdiv => (true, false), + Opcode::Srem => (true, true), + _other => return None, + }; + let argR: Value = args[1]; + if let Some(simm64) = get_const(argR, dfg) { + let argL: Value = args[0]; + // Pull the operation size (type) from the left arg + let argL_ty = dfg.value_type(argL); + return package_up_divrem_info(argL, argL_ty, simm64, isSigned, isRem); + } + } + + None +} + +// Actually do the transformation given a bundle containing the relevant +// information. `divrem_info` describes a div or rem by a constant, that +// `pos` currently points at, and `inst` is the associated instruction. +// `inst` is replaced by a sequence of other operations that calculate the +// same result. Note that there are various `divrem_info` cases where we +// cannot do any transformation, in which case `inst` is left unchanged. +fn do_divrem_transformation(divrem_info: &DivRemByConstInfo, pos: &mut FuncCursor, inst: Inst) { + let isRem = match *divrem_info { + DivRemByConstInfo::DivU32(_, _) | + DivRemByConstInfo::DivU64(_, _) | + DivRemByConstInfo::DivS32(_, _) | + DivRemByConstInfo::DivS64(_, _) => false, + DivRemByConstInfo::RemU32(_, _) | + DivRemByConstInfo::RemU64(_, _) | + DivRemByConstInfo::RemS32(_, _) | + DivRemByConstInfo::RemS64(_, _) => true, + }; + + match divrem_info { + + // -------------------- U32 -------------------- + + // U32 div, rem by zero: ignore + &DivRemByConstInfo::DivU32(_n1, 0) | + &DivRemByConstInfo::RemU32(_n1, 0) => {} + + // U32 div by 1: identity + // U32 rem by 1: zero + &DivRemByConstInfo::DivU32(n1, 1) | + &DivRemByConstInfo::RemU32(n1, 1) => { + if isRem { + pos.func.dfg.replace(inst).iconst(I32, 0); + } else { + pos.func.dfg.replace(inst).copy(n1); + } + } + + // U32 div, rem by a power-of-2 + &DivRemByConstInfo::DivU32(n1, d) | + &DivRemByConstInfo::RemU32(n1, d) if d.is_power_of_two() => { + assert!(d >= 2); + // compute k where d == 2^k + let k = d.trailing_zeros(); + assert!(k >= 1 && k <= 31); + if isRem { + let mask = (1u64 << k) - 1; + pos.func.dfg.replace(inst).band_imm(n1, mask as i64); + } else { + pos.func.dfg.replace(inst).ushr_imm(n1, k as i64); + } + } + + // U32 div, rem by non-power-of-2 + &DivRemByConstInfo::DivU32(n1, d) | + &DivRemByConstInfo::RemU32(n1, d) => { + assert!(d >= 3); + let MU32 { + mulBy, + doAdd, + shiftBy, + } = magicU32(d); + let qf; // final quotient + let q0 = pos.ins().iconst(I32, mulBy as i64); + let q1 = pos.ins().umulhi(n1, q0); + if doAdd { + assert!(shiftBy >= 1 && shiftBy <= 32); + let t1 = pos.ins().isub(n1, q1); + let t2 = pos.ins().ushr_imm(t1, 1); + let t3 = pos.ins().iadd(t2, q1); + // I never found any case where shiftBy == 1 here. + // So there's no attempt to fold out a zero shift. + debug_assert!(shiftBy != 1); + qf = pos.ins().ushr_imm(t3, (shiftBy - 1) as i64); + } else { + assert!(shiftBy >= 0 && shiftBy <= 31); + // Whereas there are known cases here for shiftBy == 0. + if shiftBy > 0 { + qf = pos.ins().ushr_imm(q1, shiftBy as i64); + } else { + qf = q1; + } + } + // Now qf holds the final quotient. If necessary calculate the + // remainder instead. 
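+            // (Concretely: qf == n1 / d, so the remainder falls out as
+            // n1 - qf * d, which is what the imul_imm/isub pair below
+            // computes.)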
+ if isRem { + let tt = pos.ins().imul_imm(qf, d as i64); + pos.func.dfg.replace(inst).isub(n1, tt); + } else { + pos.func.dfg.replace(inst).copy(qf); + } + } + + // -------------------- U64 -------------------- + + // U64 div, rem by zero: ignore + &DivRemByConstInfo::DivU64(_n1, 0) | + &DivRemByConstInfo::RemU64(_n1, 0) => {} + + // U64 div by 1: identity + // U64 rem by 1: zero + &DivRemByConstInfo::DivU64(n1, 1) | + &DivRemByConstInfo::RemU64(n1, 1) => { + if isRem { + pos.func.dfg.replace(inst).iconst(I64, 0); + } else { + pos.func.dfg.replace(inst).copy(n1); + } + } + + // U64 div, rem by a power-of-2 + &DivRemByConstInfo::DivU64(n1, d) | + &DivRemByConstInfo::RemU64(n1, d) if d.is_power_of_two() => { + assert!(d >= 2); + // compute k where d == 2^k + let k = d.trailing_zeros(); + assert!(k >= 1 && k <= 63); + if isRem { + let mask = (1u64 << k) - 1; + pos.func.dfg.replace(inst).band_imm(n1, mask as i64); + } else { + pos.func.dfg.replace(inst).ushr_imm(n1, k as i64); + } + } + + // U64 div, rem by non-power-of-2 + &DivRemByConstInfo::DivU64(n1, d) | + &DivRemByConstInfo::RemU64(n1, d) => { + assert!(d >= 3); + let MU64 { + mulBy, + doAdd, + shiftBy, + } = magicU64(d); + let qf; // final quotient + let q0 = pos.ins().iconst(I64, mulBy as i64); + let q1 = pos.ins().umulhi(n1, q0); + if doAdd { + assert!(shiftBy >= 1 && shiftBy <= 64); + let t1 = pos.ins().isub(n1, q1); + let t2 = pos.ins().ushr_imm(t1, 1); + let t3 = pos.ins().iadd(t2, q1); + // I never found any case where shiftBy == 1 here. + // So there's no attempt to fold out a zero shift. + debug_assert!(shiftBy != 1); + qf = pos.ins().ushr_imm(t3, (shiftBy - 1) as i64); + } else { + assert!(shiftBy >= 0 && shiftBy <= 63); + // Whereas there are known cases here for shiftBy == 0. + if shiftBy > 0 { + qf = pos.ins().ushr_imm(q1, shiftBy as i64); + } else { + qf = q1; + } + } + // Now qf holds the final quotient. If necessary calculate the + // remainder instead. + if isRem { + let tt = pos.ins().imul_imm(qf, d as i64); + pos.func.dfg.replace(inst).isub(n1, tt); + } else { + pos.func.dfg.replace(inst).copy(qf); + } + } + + // -------------------- S32 -------------------- + + // S32 div, rem by zero or -1: ignore + &DivRemByConstInfo::DivS32(_n1, -1) | + &DivRemByConstInfo::RemS32(_n1, -1) | + &DivRemByConstInfo::DivS32(_n1, 0) | + &DivRemByConstInfo::RemS32(_n1, 0) => {} + + // S32 div by 1: identity + // S32 rem by 1: zero + &DivRemByConstInfo::DivS32(n1, 1) | + &DivRemByConstInfo::RemS32(n1, 1) => { + if isRem { + pos.func.dfg.replace(inst).iconst(I32, 0); + } else { + pos.func.dfg.replace(inst).copy(n1); + } + } + + &DivRemByConstInfo::DivS32(n1, d) | + &DivRemByConstInfo::RemS32(n1, d) => { + if let Some((isNeg, k)) = isPowerOf2_S32(d) { + // k can be 31 only in the case that d is -2^31. + assert!(k >= 1 && k <= 31); + let t1 = if k - 1 == 0 { + n1 + } else { + pos.ins().sshr_imm(n1, (k - 1) as i64) + }; + let t2 = pos.ins().ushr_imm(t1, (32 - k) as i64); + let t3 = pos.ins().iadd(n1, t2); + if isRem { + // S32 rem by a power-of-2 + let t4 = pos.ins().band_imm(t3, i32::wrapping_neg(1 << k) as i64); + // Curiously, we don't care here what the sign of d is. 
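+                    // (That is because, with truncating division, the
+                    // remainder depends only on |d|: t4 is n1 rounded
+                    // towards zero to a multiple of 2^k, so n1 - t4 is the
+                    // remainder for both d == 2^k and d == -2^k.)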
+ pos.func.dfg.replace(inst).isub(n1, t4); + } else { + // S32 div by a power-of-2 + let t4 = pos.ins().sshr_imm(t3, k as i64); + if isNeg { + pos.func.dfg.replace(inst).irsub_imm(t4, 0); + } else { + pos.func.dfg.replace(inst).copy(t4); + } + } + } else { + // S32 div, rem by a non-power-of-2 + assert!(d < -2 || d > 2); + let MS32 { mulBy, shiftBy } = magicS32(d); + let q0 = pos.ins().iconst(I32, mulBy as i64); + let q1 = pos.ins().smulhi(n1, q0); + let q2 = if d > 0 && mulBy < 0 { + pos.ins().iadd(q1, n1) + } else if d < 0 && mulBy > 0 { + pos.ins().isub(q1, n1) + } else { + q1 + }; + assert!(shiftBy >= 0 && shiftBy <= 31); + let q3 = if shiftBy == 0 { + q2 + } else { + pos.ins().sshr_imm(q2, shiftBy as i64) + }; + let t1 = pos.ins().ushr_imm(q3, 31); + let qf = pos.ins().iadd(q3, t1); + // Now qf holds the final quotient. If necessary calculate + // the remainder instead. + if isRem { + let tt = pos.ins().imul_imm(qf, d as i64); + pos.func.dfg.replace(inst).isub(n1, tt); + } else { + pos.func.dfg.replace(inst).copy(qf); + } + } + } + + // -------------------- S64 -------------------- + + // S64 div, rem by zero or -1: ignore + &DivRemByConstInfo::DivS64(_n1, -1) | + &DivRemByConstInfo::RemS64(_n1, -1) | + &DivRemByConstInfo::DivS64(_n1, 0) | + &DivRemByConstInfo::RemS64(_n1, 0) => {} + + // S64 div by 1: identity + // S64 rem by 1: zero + &DivRemByConstInfo::DivS64(n1, 1) | + &DivRemByConstInfo::RemS64(n1, 1) => { + if isRem { + pos.func.dfg.replace(inst).iconst(I64, 0); + } else { + pos.func.dfg.replace(inst).copy(n1); + } + } + + &DivRemByConstInfo::DivS64(n1, d) | + &DivRemByConstInfo::RemS64(n1, d) => { + if let Some((isNeg, k)) = isPowerOf2_S64(d) { + // k can be 63 only in the case that d is -2^63. + assert!(k >= 1 && k <= 63); + let t1 = if k - 1 == 0 { + n1 + } else { + pos.ins().sshr_imm(n1, (k - 1) as i64) + }; + let t2 = pos.ins().ushr_imm(t1, (64 - k) as i64); + let t3 = pos.ins().iadd(n1, t2); + if isRem { + // S64 rem by a power-of-2 + let t4 = pos.ins().band_imm(t3, i64::wrapping_neg(1 << k)); + // Curiously, we don't care here what the sign of d is. + pos.func.dfg.replace(inst).isub(n1, t4); + } else { + // S64 div by a power-of-2 + let t4 = pos.ins().sshr_imm(t3, k as i64); + if isNeg { + pos.func.dfg.replace(inst).irsub_imm(t4, 0); + } else { + pos.func.dfg.replace(inst).copy(t4); + } + } + } else { + // S64 div, rem by a non-power-of-2 + assert!(d < -2 || d > 2); + let MS64 { mulBy, shiftBy } = magicS64(d); + let q0 = pos.ins().iconst(I64, mulBy); + let q1 = pos.ins().smulhi(n1, q0); + let q2 = if d > 0 && mulBy < 0 { + pos.ins().iadd(q1, n1) + } else if d < 0 && mulBy > 0 { + pos.ins().isub(q1, n1) + } else { + q1 + }; + assert!(shiftBy >= 0 && shiftBy <= 63); + let q3 = if shiftBy == 0 { + q2 + } else { + pos.ins().sshr_imm(q2, shiftBy as i64) + }; + let t1 = pos.ins().ushr_imm(q3, 63); + let qf = pos.ins().iadd(q3, t1); + // Now qf holds the final quotient. If necessary calculate + // the remainder instead. + if isRem { + let tt = pos.ins().imul_imm(qf, d); + pos.func.dfg.replace(inst).isub(n1, tt); + } else { + pos.func.dfg.replace(inst).copy(qf); + } + } + } + + } +} + + +//---------------------------------------------------------------------- +// +// General pattern-match helpers. + +// Find out if `value` actually resolves to a constant, and if so what its +// value is. 
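+// For example, if `value` was produced by `v1 = iconst.i32 7`, this returns
+// Some(7); for anything else, including EBB parameters, it returns None.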
+fn get_const(value: Value, dfg: &DataFlowGraph) -> Option<i64> {
+    match dfg.value_def(value) {
+        ValueDef::Result(definingInst, resultNo) => {
+            let definingIData: &InstructionData = &dfg[definingInst];
+            if let &InstructionData::UnaryImm { opcode, imm } = definingIData {
+                if opcode == Opcode::Iconst && resultNo == 0 {
+                    return Some(imm.into());
+                }
+            }
+            None
+        }
+        ValueDef::Param(_definingEbb, _paramNo) => None,
+    }
+}
+
+
+//----------------------------------------------------------------------
+//
+// The main pre-opt pass.
+
+pub fn do_preopt(func: &mut Function) {
+    let _tt = timing::preopt();
+    let mut pos = FuncCursor::new(func);
+    while let Some(_ebb) = pos.next_ebb() {
+
+        while let Some(inst) = pos.next_inst() {
+
+            //-- BEGIN -- division by constants ----------------
+
+            let mb_dri = get_div_info(inst, &pos.func.dfg);
+            if let Some(divrem_info) = mb_dri {
+                do_divrem_transformation(&divrem_info, &mut pos, inst);
+                continue;
+            }
+
+            //-- END -- division by constants ------------------
+        }
+    }
+}
diff --git a/lib/cretonne/src/timing.rs b/lib/cretonne/src/timing.rs
index d120d43c32..5419895fdc 100644
--- a/lib/cretonne/src/timing.rs
+++ b/lib/cretonne/src/timing.rs
@@ -55,6 +55,7 @@ define_passes!{
     flowgraph: "Control flow graph",
     domtree: "Dominator tree",
     loop_analysis: "Loop analysis",
+    preopt: "Pre-legalization rewriting",
     legalize: "Legalization",
     gvn: "Global value numbering",
     licm: "Loop invariant code motion",