cranelift: Port rotr lowering to ISLE on x64

Nick Fitzgerald
2022-01-13 13:22:48 -08:00
parent 4120e40318
commit a41fdb0303
6 changed files with 422 additions and 678 deletions


@@ -877,316 +877,6 @@ fn emit_bitrev<C: LowerCtx<I = Inst>>(ctx: &mut C, src: Reg, dst: Writable<Reg>,
    ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64));
}
fn emit_shl_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt_src: Reg,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

    // mov tmp1, src_lo
    // shl tmp1, amt_src
    // mov tmp2, src_hi
    // shl tmp2, amt_src
    // mov amt, 64
    // sub amt, amt_src
    // mov tmp3, src_lo
    // shr tmp3, amt
    // xor dst_lo, dst_lo
    // test amt_src, 127
    // cmovz tmp3, dst_lo
    // or tmp3, tmp2
    // mov amt, amt_src
    // and amt, 64
    // cmovz dst_hi, tmp3
    // cmovz dst_lo, tmp1
    // cmovnz dst_hi, tmp1
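    //
    // In words: x86 variable shifts mask a 64-bit count to 6 bits, so the
    // 128-bit shift is assembled from two 64-bit cases and selected with
    // cmovs. For amt < 64 the result is
    //   dst_hi = (src_hi << amt) | (src_lo >> (64 - amt)), dst_lo = src_lo << amt;
    // for amt >= 64 it is
    //   dst_hi = src_lo << (amt - 64), dst_lo = 0.
    // The `test amt_src, 127` guards the amt % 128 == 0 case, where
    // `src_lo >> (64 - 0)` would be masked to a shift by zero and would
    // otherwise leak src_lo's bits into the high half.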
    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp1,
    ));
    ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp2,
    ));
    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Sub,
        RegMemImm::reg(amt_src),
        amt,
    ));
    ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt.to_reg(),
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        None,
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::test_rmi_r(
        OperandSize::Size64,
        RegMemImm::imm(127),
        amt_src,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(dst_lo.to_reg()),
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp2.to_reg()),
        tmp3,
    ));
    // This isn't semantically necessary, but it keeps the
    // register allocator happy, because it cannot otherwise
    // infer that cmovz + cmovnz always defines dst_hi.
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_hi.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::imm(64),
        amt,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp3.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp1.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::NZ,
        RegMem::reg(tmp1.to_reg()),
        dst_hi,
    ));
}
fn emit_shr_i128<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    src: ValueRegs<Reg>,
    dst: ValueRegs<Writable<Reg>>,
    amt_src: Reg,
    is_signed: bool,
) {
    let src_lo = src.regs()[0];
    let src_hi = src.regs()[1];
    let dst_lo = dst.regs()[0];
    let dst_hi = dst.regs()[1];

    // mov tmp1, src_hi
    // {u,s}shr tmp1, amt_src
    // mov tmp2, src_lo
    // ushr tmp2, amt_src
    // mov amt, 64
    // sub amt, amt_src
    // mov tmp3, src_hi
    // shl tmp3, amt
    // xor dst_lo, dst_lo
    // test amt_src, 127
    // cmovz tmp3, dst_lo
    // or tmp3, tmp2
    // if is_signed:
    //     mov dst_hi, src_hi
    //     sshr dst_hi, 63 // get the sign bit
    // else:
    //     xor dst_hi, dst_hi
    // mov amt, amt_src
    // and amt, 64
    // cmovz dst_hi, tmp1
    // cmovz dst_lo, tmp3
    // cmovnz dst_lo, tmp1
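    //
    // This mirrors the 128-bit left shift above. For amt < 64 the result is
    //   dst_lo = (src_lo >> amt) | (src_hi << (64 - amt)), dst_hi = src_hi >> amt
    // (arithmetic shift for sshr); for amt >= 64 it is
    //   dst_lo = src_hi >> (amt - 64),
    // with dst_hi set to the fill value (sign bits for sshr, zero for ushr).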
    let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
    let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();

    let shift_kind = if is_signed {
        ShiftKind::ShiftRightArithmetic
    } else {
        ShiftKind::ShiftRightLogical
    };

    ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp1));
    ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt_src,
        types::I64,
    ));
    // N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftRightLogical,
        None,
        tmp2,
    ));
    ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Sub,
        RegMemImm::reg(amt_src),
        amt,
    ));
    ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64));
    ctx.emit(Inst::gen_move(
        Writable::from_reg(regs::rcx()),
        amt.to_reg(),
        types::I64,
    ));
    ctx.emit(Inst::shift_r(
        OperandSize::Size64,
        ShiftKind::ShiftLeft,
        None,
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::test_rmi_r(
        OperandSize::Size64,
        RegMemImm::imm(127),
        amt_src,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(dst_lo.to_reg()),
        tmp3,
    ));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Or,
        RegMemImm::reg(tmp2.to_reg()),
        tmp3,
    ));
    if is_signed {
        ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64));
        ctx.emit(Inst::shift_r(
            OperandSize::Size64,
            ShiftKind::ShiftRightArithmetic,
            Some(63),
            dst_hi,
        ));
    } else {
        ctx.emit(Inst::alu_rmi_r(
            OperandSize::Size64,
            AluRmiROpcode::Xor,
            RegMemImm::reg(dst_hi.to_reg()),
            dst_hi,
        ));
    }
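    // dst_hi now holds the amt >= 64 fill value: src_hi's sign bit
    // replicated for the signed case, zero otherwise. It survives as the
    // final dst_hi only when the `cmovz dst_hi, tmp1` below is not taken.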
    // This isn't semantically necessary, but it keeps the
    // register allocator happy, because it cannot otherwise
    // infer that cmovz + cmovnz always defines dst_lo.
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::Xor,
        RegMemImm::reg(dst_lo.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::gen_move(amt, amt_src, types::I64));
    ctx.emit(Inst::alu_rmi_r(
        OperandSize::Size64,
        AluRmiROpcode::And,
        RegMemImm::imm(64),
        amt,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp1.to_reg()),
        dst_hi,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::Z,
        RegMem::reg(tmp3.to_reg()),
        dst_lo,
    ));
    ctx.emit(Inst::cmove(
        OperandSize::Size64,
        CC::NZ,
        RegMem::reg(tmp1.to_reg()),
        dst_lo,
    ));
}
fn make_libcall_sig<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    insn: IRInst,
@@ -1542,99 +1232,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        | Opcode::Ushr
        | Opcode::Sshr
        | Opcode::Ishl
        | Opcode::Rotl => implemented_in_isle(ctx),

        Opcode::Rotr => {
            let dst_ty = ctx.output_ty(insn, 0);
            debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty);

            if !dst_ty.is_vector() && dst_ty.bits() <= 64 {
                // Scalar shifts on x86 have various encodings:
                // - shift by one bit, e.g. `SAL r/m8, 1` (not used here);
                // - shift by an immediate amount, e.g. `SAL r/m8, imm8`;
                // - shift by a dynamic amount, but only from the CL register,
                //   e.g. `SAL r/m8, CL`.
                // This implementation uses the last two encodings.
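                // The dynamic case below moves the amount into CL and leaves
                // `count` as `None`, so the emitted `shift_r` uses the
                // CL-based encoding.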
                let (size, lhs) = match dst_ty {
                    types::I8 | types::I16 => match op {
                        Opcode::Rotr => (
                            OperandSize::from_ty(dst_ty),
                            put_input_in_reg(ctx, inputs[0]),
                        ),
                        _ => unreachable!(),
                    },
                    types::I32 | types::I64 => (
                        OperandSize::from_ty(dst_ty),
                        put_input_in_reg(ctx, inputs[0]),
                    ),
                    _ => unreachable!("unhandled output type for shift/rotates: {}", dst_ty),
                };

                let (count, rhs) =
                    if let Some(cst) = ctx.get_input_as_source_or_const(insn, 1).constant {
                        // Mask the count, according to Cranelift's semantics.
                        let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
                        (Some(cst), None)
                    } else {
                        // We can ignore the upper register if the shift amount
                        // is multi-reg, because the amount is taken mod the
                        // lhs width anyway.
                        (None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
                    };
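                // E.g., a constant rotr.i16 amount of 35 is masked to
                // 35 & 15 == 3.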
                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                let shift_kind = match op {
                    Opcode::Rotr => ShiftKind::RotateRight,
                    _ => unreachable!(),
                };

                let w_rcx = Writable::from_reg(regs::rcx());
                ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
                if count.is_none() {
                    ctx.emit(Inst::mov_r_r(OperandSize::Size64, rhs.unwrap(), w_rcx));
                }
                ctx.emit(Inst::shift_r(size, shift_kind, count, dst));
            } else if dst_ty == types::I128 {
                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
                let src = put_input_in_regs(ctx, inputs[0]);
                let dst = get_output_reg(ctx, outputs[0]);

                match op {
                    Opcode::Rotr => {
                        // (mov tmp, src)
                        // (ushr.i128 tmp, amt)
                        // (mov dst, src)
                        // (shl.i128 dst, 128-amt)
                        // (or dst, tmp)
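                        //
                        // I.e., the rotate is decomposed into the two 128-bit
                        // shift helpers above: the logical right shift
                        // contributes the bits that stay in place, the left
                        // shift by 128 - amt contributes the bits that wrap
                        // around, and the two halves are OR'd together.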
                        let tmp = ctx.alloc_tmp(types::I128);
                        emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);

                        let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap();
                        ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Sub,
                            RegMemImm::reg(amt_src),
                            inv_amt,
                        ));
                        emit_shl_i128(ctx, src, dst, inv_amt.to_reg());

                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[0].to_reg()),
                            dst.regs()[0],
                        ));
                        ctx.emit(Inst::alu_rmi_r(
                            OperandSize::Size64,
                            AluRmiROpcode::Or,
                            RegMemImm::reg(tmp.regs()[1].to_reg()),
                            dst.regs()[1],
                        ));
                    }
                    _ => unreachable!(),
                }
            } else {
                implemented_in_isle(ctx);
            }
        }
        | Opcode::Rotl
        | Opcode::Rotr => implemented_in_isle(ctx),
        Opcode::Ineg => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();