cranelift: Port rotr lowering to ISLE on x64

Author: Nick Fitzgerald
Date:   2022-01-13 13:22:48 -08:00
parent 4120e40318
commit a41fdb0303

6 changed files with 422 additions and 678 deletions

View File

@@ -811,6 +811,12 @@
 (rule (m_rotl ty src1 src2)
       (shift_r ty (ShiftKind.RotateLeft) src1 src2))
 
+;; Helper for creating `rotr` instructions (prefixed with "m_", short for "mach
+;; inst", to disambiguate this from clif's `rotr`).
+(decl m_rotr (Type Reg Imm8Reg) Reg)
+(rule (m_rotr ty src1 src2)
+      (shift_r ty (ShiftKind.RotateRight) src1 src2))
+
 ;; Helper for creating `shl` instructions.
 (decl shl (Type Reg Imm8Reg) Reg)
 (rule (shl ty src1 src2)

View File

@@ -865,6 +865,46 @@
     (or_i128 (shl_i128 src_ amt_)
              (shr_i128 src_ (sub $I64 (imm $I64 128) (RegMemImm.Reg amt_))))))
 
+;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; `i16` and `i8`: we need to extend the shift amount, or mask the
+;; constant.
+
+(rule (lower (has_type (ty_8_or_16 ty) (rotr src amt)))
+      (let ((amt_ Reg (extend_to_reg amt $I32 (ExtendKind.Zero))))
+        (value_reg (m_rotr ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
+
+(rule (lower (has_type (ty_8_or_16 ty)
+                       (rotr src (u64_from_iconst amt))))
+      (value_reg (m_rotr ty
+                         (put_in_reg src)
+                         (const_to_type_masked_imm8 amt ty))))
+
+;; `i64` and `i32`: we can rely on x86's rotate-amount masking since
+;; we operate on the whole register.
+
+(rule (lower (has_type (ty_32_or_64 ty) (rotr src amt)))
+      ;; NB: Only the low bits of `amt` matter since we logically mask the
+      ;; shift amount to the value's bit width.
+      (let ((amt_ Reg (lo_reg amt)))
+        (value_reg (m_rotr ty (put_in_reg src) (Imm8Reg.Reg amt_)))))
+
+(rule (lower (has_type (ty_32_or_64 ty)
+                       (rotr src (u64_from_iconst amt))))
+      (value_reg (m_rotr ty
+                         (put_in_reg src)
+                         (const_to_type_masked_imm8 amt ty))))
+
+;; `i128`.
+(rule (lower (has_type $I128 (rotr src amt)))
+      (let ((src_ ValueRegs (put_in_regs src))
+            ;; NB: Only the low bits of `amt` matter since we logically mask the
+            ;; rotation amount to the value's bit width.
+            (amt_ Reg (lo_reg amt)))
+        (or_i128 (shr_i128 src_ amt_)
+                 (shl_i128 src_ (sub $I64 (imm $I64 128) (RegMemImm.Reg amt_))))))
+
 ;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type (multi_lane 8 16)
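
As a sanity check on the `$I128` rule above: it expresses rotate-right as a masked right shift or'd with a compensating left shift, where the `shr_i128`/`shl_i128` helpers give both shifts mod-128 semantics (their `testq $127` plus `cmov` sequences). A minimal Rust model of that identity — illustrative only, `rotr128_model` is an invented name, not part of this change:

// Illustrative model (not part of this commit) of the $I128 rotr rule:
// rotr(x, amt) == (x >> (amt & 127)) | (x << ((128 - amt) & 127)).
fn rotr128_model(x: u128, amt: u32) -> u128 {
    let r = amt & 127;       // right-shift amount, masked to the bit width
    let l = (128 - r) & 127; // compensating left-shift amount, likewise masked
    // When r == 0, both shifts are by 0 and the result is x | x == x,
    // mirroring how the emitted testq $127 + cmovz sequences behave.
    (x >> r) | (x << l)
}

#[test]
fn rotr128_model_matches_rotate_right() {
    let x = (1u128 << 127) | 0x0123_4567_89ab_cdef;
    for amt in 0..256u32 {
        assert_eq!(rotr128_model(x, amt), x.rotate_right(amt));
    }
}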

View File

@@ -877,316 +877,6 @@ fn emit_bitrev<C: LowerCtx<I = Inst>>(ctx: &mut C, src: Reg, dst: Writable<Reg>,
     ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64));
 }
 
-fn emit_shl_i128<C: LowerCtx<I = Inst>>(
-    ctx: &mut C,
-    src: ValueRegs<Reg>,
-    dst: ValueRegs<Writable<Reg>>,
-    amt_src: Reg,
-) {
-    let src_lo = src.regs()[0];
-    let src_hi = src.regs()[1];
-    let dst_lo = dst.regs()[0];
-    let dst_hi = dst.regs()[1];
-
-    // mov tmp1, src_lo
-    // shl tmp1, amt_src
-    // mov tmp2, src_hi
-    // shl tmp2, amt_src
-    // mov amt, 64
-    // sub amt, amt_src
-    // mov tmp3, src_lo
-    // shr tmp3, amt
-    // xor dst_lo, dst_lo
-    // test amt_src, 127
-    // cmovz tmp3, dst_lo
-    // or tmp3, tmp2
-    // mov amt, amt_src
-    // and amt, 64
-    // cmovz dst_hi, tmp3
-    // cmovz dst_lo, tmp1
-    // cmovnz dst_hi, tmp1
-
-    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-
-    ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64));
-    ctx.emit(Inst::gen_move(
-        Writable::from_reg(regs::rcx()),
-        amt_src,
-        types::I64,
-    ));
-    ctx.emit(Inst::shift_r(
-        OperandSize::Size64,
-        ShiftKind::ShiftLeft,
-        None,
-        tmp1,
-    ));
-
-    ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64));
-    ctx.emit(Inst::gen_move(
-        Writable::from_reg(regs::rcx()),
-        amt_src,
-        types::I64,
-    ));
-    ctx.emit(Inst::shift_r(
-        OperandSize::Size64,
-        ShiftKind::ShiftLeft,
-        None,
-        tmp2,
-    ));
-
-    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Sub,
-        RegMemImm::reg(amt_src),
-        amt,
-    ));
-
-    ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64));
-    ctx.emit(Inst::gen_move(
-        Writable::from_reg(regs::rcx()),
-        amt.to_reg(),
-        types::I64,
-    ));
-    ctx.emit(Inst::shift_r(
-        OperandSize::Size64,
-        ShiftKind::ShiftRightLogical,
-        None,
-        tmp3,
-    ));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Xor,
-        RegMemImm::reg(dst_lo.to_reg()),
-        dst_lo,
-    ));
-    ctx.emit(Inst::test_rmi_r(
-        OperandSize::Size64,
-        RegMemImm::imm(127),
-        amt_src,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::Z,
-        RegMem::reg(dst_lo.to_reg()),
-        tmp3,
-    ));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Or,
-        RegMemImm::reg(tmp2.to_reg()),
-        tmp3,
-    ));
-
-    // This isn't semantically necessary, but it keeps the
-    // register allocator happy, because it cannot otherwise
-    // infer that cmovz + cmovnz always defines dst_hi.
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Xor,
-        RegMemImm::reg(dst_hi.to_reg()),
-        dst_hi,
-    ));
-
-    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::And,
-        RegMemImm::imm(64),
-        amt,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::Z,
-        RegMem::reg(tmp3.to_reg()),
-        dst_hi,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::Z,
-        RegMem::reg(tmp1.to_reg()),
-        dst_lo,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::NZ,
-        RegMem::reg(tmp1.to_reg()),
-        dst_hi,
-    ));
-}
-
-fn emit_shr_i128<C: LowerCtx<I = Inst>>(
-    ctx: &mut C,
-    src: ValueRegs<Reg>,
-    dst: ValueRegs<Writable<Reg>>,
-    amt_src: Reg,
-    is_signed: bool,
-) {
-    let src_lo = src.regs()[0];
-    let src_hi = src.regs()[1];
-    let dst_lo = dst.regs()[0];
-    let dst_hi = dst.regs()[1];
-
-    // mov tmp1, src_hi
-    // {u,s}shr tmp1, amt_src
-    // mov tmp2, src_lo
-    // ushr tmp2, amt_src
-    // mov amt, 64
-    // sub amt, amt_src
-    // mov tmp3, src_hi
-    // shl tmp3, amt
-    // xor dst_lo, dst_lo
-    // test amt_src, 127
-    // cmovz tmp3, dst_lo
-    // or tmp3, tmp2
-    // if is_signed:
-    //     mov dst_hi, src_hi
-    //     sshr dst_hi, 63 // get the sign bit
-    // else:
-    //     xor dst_hi, dst_hi
-    // mov amt, amt_src
-    // and amt, 64
-    // cmovz dst_hi, tmp1
-    // cmovz dst_lo, tmp3
-    // cmovnz dst_lo, tmp1
-
-    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-
-    let shift_kind = if is_signed {
-        ShiftKind::ShiftRightArithmetic
-    } else {
-        ShiftKind::ShiftRightLogical
-    };
-
-    ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64));
-    ctx.emit(Inst::gen_move(
-        Writable::from_reg(regs::rcx()),
-        amt_src,
-        types::I64,
-    ));
-    ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp1));
-
-    ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64));
-    ctx.emit(Inst::gen_move(
-        Writable::from_reg(regs::rcx()),
-        amt_src,
-        types::I64,
-    ));
-    // N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
-    ctx.emit(Inst::shift_r(
-        OperandSize::Size64,
-        ShiftKind::ShiftRightLogical,
-        None,
-        tmp2,
-    ));
-
-    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Sub,
-        RegMemImm::reg(amt_src),
-        amt,
-    ));
-
-    ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64));
-    ctx.emit(Inst::gen_move(
-        Writable::from_reg(regs::rcx()),
-        amt.to_reg(),
-        types::I64,
-    ));
-    ctx.emit(Inst::shift_r(
-        OperandSize::Size64,
-        ShiftKind::ShiftLeft,
-        None,
-        tmp3,
-    ));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Xor,
-        RegMemImm::reg(dst_lo.to_reg()),
-        dst_lo,
-    ));
-    ctx.emit(Inst::test_rmi_r(
-        OperandSize::Size64,
-        RegMemImm::imm(127),
-        amt_src,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::Z,
-        RegMem::reg(dst_lo.to_reg()),
-        tmp3,
-    ));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Or,
-        RegMemImm::reg(tmp2.to_reg()),
-        tmp3,
-    ));
-
-    if is_signed {
-        ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64));
-        ctx.emit(Inst::shift_r(
-            OperandSize::Size64,
-            ShiftKind::ShiftRightArithmetic,
-            Some(63),
-            dst_hi,
-        ));
-    } else {
-        ctx.emit(Inst::alu_rmi_r(
-            OperandSize::Size64,
-            AluRmiROpcode::Xor,
-            RegMemImm::reg(dst_hi.to_reg()),
-            dst_hi,
-        ));
-    }
-
-    // This isn't semantically necessary, but it keeps the
-    // register allocator happy, because it cannot otherwise
-    // infer that cmovz + cmovnz always defines dst_lo.
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Xor,
-        RegMemImm::reg(dst_lo.to_reg()),
-        dst_lo,
-    ));
-
-    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::And,
-        RegMemImm::imm(64),
-        amt,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::Z,
-        RegMem::reg(tmp1.to_reg()),
-        dst_hi,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::Z,
-        RegMem::reg(tmp3.to_reg()),
-        dst_lo,
-    ));
-    ctx.emit(Inst::cmove(
-        OperandSize::Size64,
-        CC::NZ,
-        RegMem::reg(tmp1.to_reg()),
-        dst_lo,
-    ));
-}
-
 fn make_libcall_sig<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     insn: IRInst,
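
For reference, the now-deleted `emit_shl_i128` implemented the branch-free dataflow sketched in its pseudo-asm comment: x86's `shlq %cl`/`shrq %cl` mask the count to 6 bits, and the `testq $127`/`testq $64` plus `cmov` pairs select between the amt < 64 and amt >= 64 cases without branching (`emit_shr_i128` is the mirror image, with the signed variant filling the high half from the sign bit instead of zero). A plain Rust model of that sequence, with an invented name, not code from the tree:

// Sketch (invented helper, not repository code) of the cmov dataflow in the
// deleted emit_shl_i128: a 128-bit left shift built from 64-bit operations.
fn shl_i128_model(src: u128, amt: u32) -> u128 {
    let (src_lo, src_hi) = (src as u64, (src >> 64) as u64);
    let tmp1 = src_lo << (amt & 63); // shl tmp1, amt (hardware masks cl to 6 bits)
    let tmp2 = src_hi << (amt & 63); // shl tmp2, amt
    // Bits carried from the low half into the high half:
    let mut tmp3 = src_lo >> (64u32.wrapping_sub(amt) & 63); // shr tmp3, 64 - amt
    if amt & 127 == 0 {
        tmp3 = 0; // test amt_src, 127; cmovz: no carry when the masked amount is 0
    }
    tmp3 |= tmp2; // or tmp3, tmp2
    // test amt, 64; cmovz/cmovnz: pick the amt < 64 or 64 <= amt < 128 case.
    let (dst_lo, dst_hi) = if amt & 64 == 0 {
        (tmp1, tmp3)
    } else {
        (0, tmp1) // shifting by 64 or more zeroes the low half
    };
    ((dst_hi as u128) << 64) | (dst_lo as u128)
}

#[test]
fn shl_i128_model_matches_native_shift() {
    let x = (1u128 << 127) | 0x0123_4567_89ab_cdef;
    for amt in 0..128u32 {
        assert_eq!(shl_i128_model(x, amt), x << amt);
    }
}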
@@ -1542,99 +1232,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Ushr
         | Opcode::Sshr
         | Opcode::Ishl
-        | Opcode::Rotl => implemented_in_isle(ctx),
+        | Opcode::Rotl
+        | Opcode::Rotr => implemented_in_isle(ctx),
 
-        Opcode::Rotr => {
-            let dst_ty = ctx.output_ty(insn, 0);
-            debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);
-
-            if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
-                // Scalar shifts on x86 have various encodings:
-                // - shift by one bit, e.g. `SAL r/m8, 1` (not used here)
-                // - shift by an immediate amount, e.g. `SAL r/m8, imm8`
-                // - shift by a dynamic amount but only from the CL register, e.g. `SAL r/m8, CL`.
-                // This implementation uses the last two encoding methods.
-                let (size, lhs) = match dst_ty {
-                    types::I8 | types::I16 => match op {
-                        Opcode::Rotr => (
-                            OperandSize::from_ty(dst_ty),
-                            put_input_in_reg(ctx, inputs[0]),
-                        ),
-                        _ => unreachable!(),
-                    },
-                    types::I32 | types::I64 => (
-                        OperandSize::from_ty(dst_ty),
-                        put_input_in_reg(ctx, inputs[0]),
-                    ),
-                    _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
-                };
-
-                let (count, rhs) =
-                    if let Some(cst) = ctx.get_input_as_source_or_const(insn, 1).constant {
-                        // Mask count, according to Cranelift's semantics.
-                        let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
-                        (Some(cst), None)
-                    } else {
-                        // We can ignore upper registers if shift amount is multi-reg, because we
-                        // are taking the shift amount mod 2^(lhs_width) anyway.
-                        (None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
-                    };
-
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-                let shift_kind = match op {
-                    Opcode::Rotr => ShiftKind::RotateRight,
-                    _ => unreachable!(),
-                };
-
-                let w_rcx = Writable::from_reg(regs::rcx());
-                ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
-                if count.is_none() {
-                    ctx.emit(Inst::mov_r_r(OperandSize::Size64, rhs.unwrap(), w_rcx));
-                }
-                ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
-            } else if dst_ty == types::I128 {
-                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
-                let src = put_input_in_regs(ctx, inputs[0]);
-                let dst = get_output_reg(ctx, outputs[0]);
-
-                match op {
-                    Opcode::Rotr => {
-                        // (mov tmp, src)
-                        // (ushr.i128 tmp, amt)
-                        // (mov dst, src)
-                        // (shl.i128 dst, 128-amt)
-                        // (or dst, tmp)
-                        let tmp = ctx.alloc_tmp(types::I128);
-                        emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
-                        let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Sub,
-                            RegMemImm::reg(amt_src),
-                            inv_amt,
-                        ));
-                        emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp.regs()[0].to_reg()),
-                            dst.regs()[0],
-                        ));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size64,
-                            AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp.regs()[1].to_reg()),
-                            dst.regs()[1],
-                        ));
-                    }
-                    _ => unreachable!(),
-                }
-            } else {
-                implemented_in_isle(ctx);
-            }
-        }
-
         Opcode::Ineg => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
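
The deleted scalar arm relied on two facts that carry over to the new ISLE rules: clif's `rotr` takes its amount modulo the type's bit width (hence the `(cst as u8) & (dst_ty.bits() as u8 - 1)` masking of constants, and hence ignoring the upper registers of a multi-reg amount), and dynamic amounts must be moved into CL. A small illustration of the masking semantics, with a made-up function name, not repository code:

// Illustration (not repository code) of clif's rotr semantics for i8:
// the rotate amount is taken mod the type's bit width.
fn masked_rotr8(x: u8, amt: u64) -> u8 {
    let cst = (amt as u8) & (8 - 1); // same masking as the deleted lowering
    x.rotate_right(cst as u32)
}

#[test]
fn rotr8_amount_is_mod_8() {
    // Rotating an i8 by 9 is the same as rotating it by 1.
    assert_eq!(masked_rotr8(0b1000_0001, 9), 0b1100_0000);
    assert_eq!(masked_rotr8(0b1000_0001, 9), masked_rotr8(0b1000_0001, 1));
}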

View File

@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
 src/prelude.isle 7b911d3b894ae17
-src/isa/x64/inst.isle 41304d8ef6f7d816
-src/isa/x64/lower.isle 4689585f55f41438
+src/isa/x64/inst.isle 54ffef8c4f373807
+src/isa/x64/lower.isle 28de5d6bf49c8471

File diff suppressed because it is too large

View File

@@ -1270,56 +1270,52 @@ block0(v0: i128, v1: i128):
 ; Entry block: 0
 ; Block 0:
 ; (original IR block: block0)
-; (instruction range: 0 .. 50)
+; (instruction range: 0 .. 46)
 ; Inst 0: pushq %rbp
 ; Inst 1: movq %rsp, %rbp
-; Inst 2: movq %rsi, %rax
-; Inst 3: movq %rax, %r9
-; Inst 4: movq %rdx, %rcx
-; Inst 5: shrq %cl, %r9
-; Inst 6: movq %rdi, %rsi
-; Inst 7: movq %rdx, %rcx
-; Inst 8: shrq %cl, %rsi
-; Inst 9: movl $64, %ecx
-; Inst 10: subq %rdx, %rcx
-; Inst 11: movq %rax, %r10
-; Inst 12: shlq %cl, %r10
-; Inst 13: xorq %rcx, %rcx
-; Inst 14: testq $127, %rdx
-; Inst 15: cmovzq %rcx, %r10
-; Inst 16: orq %rsi, %r10
-; Inst 17: xorq %rsi, %rsi
-; Inst 18: xorq %r8, %r8
-; Inst 19: movq %rdx, %rcx
-; Inst 20: andq $64, %rcx
-; Inst 21: cmovzq %r9, %rsi
-; Inst 22: cmovzq %r10, %r8
-; Inst 23: cmovnzq %r9, %r8
-; Inst 24: movl $128, %r9d
-; Inst 25: subq %rdx, %r9
-; Inst 26: movq %rdi, %rdx
-; Inst 27: movq %r9, %rcx
-; Inst 28: shlq %cl, %rdx
-; Inst 29: movq %r9, %rcx
-; Inst 30: shlq %cl, %rax
-; Inst 31: movl $64, %ecx
-; Inst 32: subq %r9, %rcx
-; Inst 33: shrq %cl, %rdi
-; Inst 34: xorq %rcx, %rcx
-; Inst 35: testq $127, %r9
-; Inst 36: cmovzq %rcx, %rdi
-; Inst 37: orq %rax, %rdi
-; Inst 38: xorq %rax, %rax
-; Inst 39: andq $64, %r9
-; Inst 40: cmovzq %rdi, %rax
-; Inst 41: cmovzq %rdx, %rcx
-; Inst 42: cmovnzq %rdx, %rax
-; Inst 43: orq %r8, %rcx
-; Inst 44: orq %rsi, %rax
-; Inst 45: movq %rax, %rdx
-; Inst 46: movq %rcx, %rax
-; Inst 47: movq %rbp, %rsp
-; Inst 48: popq %rbp
-; Inst 49: ret
+; Inst 2: movq %rdi, %rax
+; Inst 3: movq %rdx, %rcx
+; Inst 4: shrq %cl, %rax
+; Inst 5: movq %rsi, %r8
+; Inst 6: movq %rdx, %rcx
+; Inst 7: shrq %cl, %r8
+; Inst 8: movl $64, %ecx
+; Inst 9: subq %rdx, %rcx
+; Inst 10: movq %rsi, %r9
+; Inst 11: shlq %cl, %r9
+; Inst 12: xorq %rcx, %rcx
+; Inst 13: testq $127, %rdx
+; Inst 14: cmovzq %rcx, %r9
+; Inst 15: movq %r9, %rcx
+; Inst 16: orq %rax, %rcx
+; Inst 17: xorq %rax, %rax
+; Inst 18: testq $64, %rdx
+; Inst 19: cmovzq %r8, %rax
+; Inst 20: cmovzq %rcx, %r8
+; Inst 21: movl $128, %r9d
+; Inst 22: subq %rdx, %r9
+; Inst 23: movq %rdi, %rdx
+; Inst 24: movq %r9, %rcx
+; Inst 25: shlq %cl, %rdx
+; Inst 26: movq %r9, %rcx
+; Inst 27: shlq %cl, %rsi
+; Inst 28: movl $64, %ecx
+; Inst 29: subq %r9, %rcx
+; Inst 30: shrq %cl, %rdi
+; Inst 31: xorq %rcx, %rcx
+; Inst 32: testq $127, %r9
+; Inst 33: cmovzq %rcx, %rdi
+; Inst 34: orq %rsi, %rdi
+; Inst 35: testq $64, %r9
+; Inst 36: movq %rdx, %rsi
+; Inst 37: cmovzq %rdi, %rsi
+; Inst 38: cmovzq %rdx, %rcx
+; Inst 39: orq %rcx, %r8
+; Inst 40: orq %rsi, %rax
+; Inst 41: movq %rax, %rdx
+; Inst 42: movq %r8, %rax
+; Inst 43: movq %rbp, %rsp
+; Inst 44: popq %rbp
+; Inst 45: ret
 ; }}