diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 51775e0a23..72aaab9276 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1371,7 +1371,7 @@ pub(crate) fn emit(
             RegMemImm::Imm { simm32 } => {
                 // FIXME JRS 2020Feb11: there are shorter encodings for
                 // cmp $imm, rax/eax/ax/al.
-                let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+                let use_imm8 = is_cmp && low8_will_sign_extend_to_32(*simm32);

                 // And also here we use the "normal" G-E ordering.
                 let opcode = if is_cmp {
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 3178b1dd11..f5d7535da4 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -2999,6 +2999,11 @@ fn test_x64_emit() {
         "48855763",
         "testq 99(%rdi), %rdx",
     ));
+    insns.push((
+        Inst::test_rmi_r(OperandSize::Size64, RegMemImm::imm(127), rdx),
+        "48F7C27F000000",
+        "testq $127, %rdx",
+    ));
     insns.push((
         Inst::test_rmi_r(OperandSize::Size64, RegMemImm::imm(76543210), rdx),
         "48F7C2EAF48F04",
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index c720b28a32..019c416597 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -886,8 +886,10 @@ fn emit_shl_i128<C: LowerCtx<I = Inst>>(
     //     sub     amt, amt_src
     //     mov     tmp3, src_lo
     //     shr     tmp3, amt
-    //     or      tmp3, tmp2
     //     xor     dst_lo, dst_lo
+    //     test    amt_src, 127
+    //     cmovz   tmp2, dst_lo
+    //     or      tmp3, tmp2
     //     mov     amt, amt_src
     //     and     amt, 64
     //     cmovz   dst_hi, tmp3
@@ -945,6 +947,28 @@
         None,
         tmp3,
     ));
+    ctx.emit(Inst::alu_rmi_r(
+        OperandSize::Size64,
+        AluRmiROpcode::Xor,
+        RegMemImm::reg(dst_lo.to_reg()),
+        dst_lo,
+    ));
+
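+    // When amt_src is 0 mod 128, the "shr tmp3, 64 - amt" above was really
+    // a shift by zero: x86 masks the shift count to 6 bits, so a shift by
+    // 64 leaves the register untouched. The test/cmovz below therefore
+    // zeroes the OR operand in that case instead of using stale bits.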
+    ctx.emit(Inst::test_rmi_r(
+        OperandSize::Size64,
+        RegMemImm::imm(127),
+        amt_src,
+    ));
+    ctx.emit(Inst::cmove(
+        OperandSize::Size64,
+        CC::Z,
+        RegMem::reg(dst_lo.to_reg()),
+        tmp2,
+    ));

     ctx.emit(Inst::alu_rmi_r(
         OperandSize::Size64,
@@ -953,12 +977,6 @@
         tmp3,
     ));

-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Xor,
-        RegMemImm::reg(dst_lo.to_reg()),
-        dst_lo,
-    ));
     // This isn't semantically necessary, but it keeps the
     // register allocator happy, because it cannot otherwise
     // infer that cmovz + cmovnz always defines dst_hi.
@@ -1011,11 +1029,14 @@ fn emit_shr_i128<C: LowerCtx<I = Inst>>(
     //     mov     tmp1, src_hi
     //     {u,s}shr tmp1, amt_src
     //     mov     tmp2, src_lo
-    //     {u,s}shr tmp2, amt_src
+    //     ushr    tmp2, amt_src
     //     mov     amt, 64
     //     sub     amt, amt_src
     //     mov     tmp3, src_hi
     //     shl     tmp3, amt
+    //     xor     dst_lo, dst_lo
+    //     test    amt_src, 127
+    //     cmovz   tmp3, dst_lo
     //     or      tmp3, tmp2
     //     if is_signed:
     //         mov     dst_hi, src_hi
@@ -1053,7 +1074,13 @@
         amt_src,
         types::I64,
     ));
-    ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp2));
+    // N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
+    ctx.emit(Inst::shift_r(
+        OperandSize::Size64,
+        ShiftKind::ShiftRightLogical,
+        None,
+        tmp2,
+    ));

     ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
     ctx.emit(Inst::alu_rmi_r(
@@ -1076,6 +1103,26 @@
         tmp3,
     ));

+    ctx.emit(Inst::alu_rmi_r(
+        OperandSize::Size64,
+        AluRmiROpcode::Xor,
+        RegMemImm::reg(dst_lo.to_reg()),
+        dst_lo,
+    ));
+    // As in emit_shl_i128: when the amount is 0 mod 128, the 64 - amt shift
+    // above was a no-op, so explicitly zero the carried bits via cmovz.
+    ctx.emit(Inst::test_rmi_r(
+        OperandSize::Size64,
+        RegMemImm::imm(127),
+        amt_src,
+    ));
+    ctx.emit(Inst::cmove(
+        OperandSize::Size64,
+        CC::Z,
+        RegMem::reg(dst_lo.to_reg()),
+        tmp3,
+    ));
+
     ctx.emit(Inst::alu_rmi_r(
         OperandSize::Size64,
         AluRmiROpcode::Or,
@@ -1957,7 +2004,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
                 (Some(cst), None)
             } else {
-                (None, Some(put_input_in_reg(ctx, inputs[1])))
+                // If the shift amount is multi-reg, only its low register matters:
+                // the amount is taken mod the shifted type's bit width, which divides 2^64.
+                (None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
             };

             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
diff --git a/cranelift/filetests/filetests/isa/x64/i128.clif b/cranelift/filetests/filetests/isa/x64/i128.clif
index 5e5d2ffb86..9b1436898e 100644
--- a/cranelift/filetests/filetests/isa/x64/i128.clif
+++ b/cranelift/filetests/filetests/isa/x64/i128.clif
@@ -644,212 +644,9 @@ block0(v0: i128):
 ; nextln: ret
 }

-function %f21(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
+; Shifts are covered by run-tests in shift-i128-run.clif.
+
-block0(v0: i128, v1: i32):
-    v2 = ushr v0, v1
-    return v2
-
-; check: movq %rdi, %rax
-; nextln: movq %rsi, %rdi
-; nextln: movq %rdi, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: shlq %cl, %rdi
-; nextln: orq %rax, %rdi
-; nextln: xorq %rax, %rax
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %rdx
-; nextln: cmovzq %rsi, %rax
-; nextln: cmovzq %rdi, %rcx
-; nextln: cmovnzq %rsi, %rcx
-; nextln: movq %rax, %rdx
-; nextln: movq %rcx, %rax
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f22(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = ishl v0, v1
-    return v2
-
-; check: movq %rsi, %rax
-; nextln: movq %rdi, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: shrq %cl, %rdi
-; nextln: orq %rax, %rdi
-; nextln: xorq %rax, %rax
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %rdx
-; nextln: cmovzq %rdi, %rcx
-; nextln: cmovzq %rsi, %rax
-; nextln: cmovnzq %rsi, %rcx
-; nextln: movq %rcx, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f23(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = sshr v0, v1
-    return v2
-
-; check: movq %rdi, %r8
-; nextln: movq %rsi, %rdi
-; nextln: movq %rdi, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: sarq %cl, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: sarq %cl, %r8
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: movq %rdi, %rax
-; nextln: shlq %cl, %rax
-; nextln: orq %r8, %rax
-; nextln: sarq $$63, %rdi
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %rdx
-; nextln: cmovzq %rsi, %rdi
-; nextln: cmovzq %rax, %rcx
-; nextln: cmovnzq %rsi, %rcx
-; nextln: movq %rcx, %rax
-; nextln: movq %rdi, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f24(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = rotr.i128 v0, v1
-    return v2
-
-; check: movq %rsi, %r9
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %r9
-; nextln: movq %rdi, %rax
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: movq %rsi, %r10
-; nextln: shlq %cl, %r10
-; nextln: orq %rax, %r10
-; nextln: xorq %r8, %r8
-; nextln: xorq %rax, %rax
-; nextln: movq %rdx, %rcx
-; nextln: andq $$64, %rcx
-; nextln: cmovzq %r9, %r8
-; nextln: cmovzq %r10, %rax
-; nextln: cmovnzq %r9, %rax
-; nextln: movl $$128, %r9d
-; nextln: subq %rdx, %r9
-; nextln: movq %rdi, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shlq %cl, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shlq %cl, %rsi
-; nextln: movl $$64, %ecx
-; nextln: subq %r9, %rcx
-; nextln: movq %rdi, %r10
-; nextln: shrq %cl, %r10
-; nextln: orq %rsi, %r10
-; nextln: xorq %rsi, %rsi
-; nextln: xorq %rdi, %rdi
-; nextln: andq $$64, %r9
-; nextln: cmovzq %r10, %rdi
-; nextln: cmovzq %rdx, %rsi
-; nextln: cmovnzq %rdx, %rdi
-; nextln: orq %rax, %rsi
-; nextln: orq %r8, %rdi
-; nextln: movq %rsi, %rax
-; nextln: movq %rdi, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f25(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = rotl.i128 v0, v1
-    return v2
-
-; check: movq %rdi, %r9
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %r9
-; nextln: movq %rsi, %rax
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: movq %rdi, %r10
-; nextln: shrq %cl, %r10
-; nextln: orq %rax, %r10
-; nextln: xorq %r8, %r8
-; nextln: xorq %rax, %rax
-; nextln: movq %rdx, %rcx
-; nextln: andq $$64, %rcx
-; nextln: cmovzq %r10, %rax
-; nextln: cmovzq %r9, %r8
-; nextln: cmovnzq %r9, %rax
-; nextln: movl $$128, %r9d
-; nextln: subq %rdx, %r9
-; nextln: movq %rsi, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shrq %cl, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shrq %cl, %rdi
-; nextln: movl $$64, %ecx
-; nextln: subq %r9, %rcx
-; nextln: shlq %cl, %rsi
-; nextln: orq %rdi, %rsi
-; nextln: xorq %rdi, %rdi
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %r9
-; nextln: cmovzq %rdx, %rdi
-; nextln: cmovzq %rsi, %rcx
-; nextln: cmovnzq %rdx, %rcx
-; nextln: orq %r8, %rcx
-; nextln: orq %rax, %rdi
-; nextln: movq %rcx, %rax
-; nextln: movq %rdi, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f26(i128, i64) {
+function %f21(i128, i64) {
 ; check: pushq %rbp
 ; nextln: movq %rsp, %rbp

@@ -865,7 +662,7 @@ block0(v0: i128, v1: i64):
 ; nextln: ret
 }

-function %f27(i64) -> i128 {
+function %f22(i64) -> i128 {
 ; check: pushq %rbp
 ; nextln: movq %rsp, %rbp

@@ -883,7 +680,7 @@ block0(v0: i64):
 ; nextln: ret
 }

-function %f28(i128, b1) -> i128 {
+function %f23(i128, b1) -> i128 {
 block0(v0: i128, v1: b1):
     v2 = iconst.i128 0
     brnz v1, block1(v2)
@@ -930,7 +727,7 @@ block2(v6: i128):
 }


-function %f29(i128, i128, i64, i128, i128, i128) -> i128 {
+function %f24(i128, i128, i64, i128, i128, i128) -> i128 {
 block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128):

     v6 = iadd.i128 v0, v1
@@ -974,7 +771,7 @@ block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128):
 }


-function %f30(i128) -> i128, i128, i128, i64, i128, i128 {
+function %f25(i128) -> i128, i128, i128, i64, i128, i128 {
 ; check: pushq %rbp
 ; nextln: movq %rsp, %rbp

@@ -996,7 +793,7 @@ block0(v0: i128):
 }


-function %f31(i128, i128) -> i128, i128 {
+function %f26(i128, i128) -> i128, i128 {
     fn0 = %g(i128, i128) -> i128, i128
 block0(v0: i128, v1: i128):
     v2, v3 = call fn0(v0, v1)
@@ -1027,7 +824,7 @@ block0(v0: i128, v1: i128):
 }


-function %f32(i128) -> i128 {
+function %f27(i128) -> i128 {
 block0(v0: i128):
     v1 = clz.i128 v0
     return v1
@@ -1056,7 +853,7 @@ block0(v0: i128):
 }


-function %f33(i128) -> i128 {
+function %f28(i128) -> i128 {
 block0(v0: i128):
     v1 = ctz.i128 v0
     return v1
@@ -1080,3 +877,18 @@ block0(v0: i128):
 ; nextln: movq %rbp, %rsp
 ; nextln: popq %rbp
 ; nextln: ret
+
+function %f29(i8, i128) -> i8 {
+block0(v0: i8, v1: i128):
+    v2 = ishl v0, v1
+    return v2
+}
+
+; check: pushq %rbp
+; nextln: movq %rsp, %rbp
+; nextln: movq %rsi, %rcx
+; nextln: shll %cl, %edi
+; nextln: movq %rdi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
diff --git a/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif
index 37bc4667e7..991b2303cf 100644
--- a/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif
@@ -2,105 +2,66 @@ test run
 target x86_64
 feature "experimental_x64"

-function %ishl1() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconcat v0, v0
-    v2 = iconst.i32 2
-    v3 = ishl.i128 v1, v2
-    v4 = iconst.i64 0x04040404_04040404
-    v5 = iconcat v4, v4
-    v6 = icmp eq v3, v5
-    return v6
+function %ishl(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = ishl.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 2) == [0x04040404_04040404, 0x04040404_04040404]
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020200, 0x02020202_02020202]
+; run: %ishl(0x01010101_01010101, 0xffffffff_ffffffff, 66) == [0x00000000_00000000, 0x04040404_04040404]
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]

-function %ishl2() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x01010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 9
-    v4 = ishl.i128 v2, v3
-    v5 = iconst.i64 0x02020202_02020200
-    v6 = iconst.i64 0x02020202_02020202
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %ushr(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = ushr.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 2) == [0x40404040_40404040, 0x00404040_40404040]
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 66) == [0x00404040_40404040, 0x00000000_00000000]
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]

-function %ishl3() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0xffffffff_ffffffff
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 66
-    v4 = ishl.i128 v2, v3
-    v5 = iconst.i64 0x00000000_00000000
-    v6 = iconst.i64 0x04040404_04040404
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %sshr(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = sshr.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %sshr(0x01010101_01010101, 0x81010101_01010101, 2) == [0x40404040_40404040, 0xe0404040_40404040]
+; run: %sshr(0x12345678_9abcdef0, 0x80101010_10101010, 66) == [0xe0040404_04040404, 0xffffffff_ffffffff]
+; run: %sshr(0x12345678_9abcdef0, 0x80101010_10101010, 0) == [0x12345678_9abcdef0, 0x80101010_10101010]
+; run: %sshr(0x12345678_9abcdef0, 0x80101010_10101010, 128) == [0x12345678_9abcdef0, 0x80101010_10101010]

-function %ushr1() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x01010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 2
-    v4 = ushr.i128 v2, v3
-    v5 = iconst.i64 0x40404040_40404040
-    v6 = iconst.i64 0x00404040_40404040
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %rotl(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = rotl.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020202, 0x02020202_02020202]
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 73) == [0x02020202_02020202, 0x02020202_02020202]
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]

-function %ushr2() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x01010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 66
-    v4 = ushr.i128 v2, v3
-    v5 = iconst.i64 0x00404040_40404040
-    v6 = iconst.i64 0x00000000_00000000
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %rotr(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = rotr.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
-
-function %sshr1() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x81010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 2
-    v4 = sshr.i128 v2, v3
-    v5 = iconst.i64 0x40404040_40404040
-    v6 = iconst.i64 0xe0404040_40404040
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
-}
-; run
-
-function %sshr2() -> b1 {
-block0:
-    v0 = iconst.i64 0x12345678_9abcdef0
-    v1 = iconst.i64 0x80101010_10101010
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 66
-    v4 = sshr.i128 v2, v3
-    v5 = iconst.i64 0xe0040404_04040404
-    v6 = iconst.i64 0xffffffff_ffffffff
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
-}
-; run
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 9) == [0x80808080_80808080, 0x80808080_80808080]
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 73) == [0x80808080_80808080, 0x80808080_80808080]
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]
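+
+; Note: the 0 and 128 vectors above exercise the amount == 0 (mod 128) edge
+; case, where the lowering's 64 - amt sub-shift is an x86 no-op.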