diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 44da584b44..ee55f3394f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -221,7 +221,7 @@ impl From<(Opcode, Type)> for BitOp {
             (Opcode::Clz, I64) => BitOp::Clz64,
             (Opcode::Cls, I32) => BitOp::Cls32,
             (Opcode::Cls, I64) => BitOp::Cls64,
-            _ => unreachable!("Called with non-bit op!"),
+            _ => unreachable!("Called with non-bit op!: {:?}", op_ty),
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 4bb3c547e3..92760a81bb 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -1240,24 +1240,64 @@ fn lower_insn_to_regs<C: LowerCtx<Inst>>(ctx: &mut C, insn: IRInst) {
             }
         }
 
-        Opcode::Bitrev | Opcode::Clz | Opcode::Cls => {
+        Opcode::Bitrev | Opcode::Clz | Opcode::Cls | Opcode::Ctz => {
             let rd = output_to_reg(ctx, outputs[0]);
-            let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
-            let op = BitOp::from((op, ty.unwrap()));
-            ctx.emit(Inst::BitRR { rd, rn, op });
-        }
+            let needs_zext = match op {
+                Opcode::Bitrev | Opcode::Ctz => false,
+                Opcode::Clz | Opcode::Cls => true,
+                _ => unreachable!(),
+            };
+            let ty = ty.unwrap();
+            let narrow_mode = if needs_zext && ty_bits(ty) == 64 {
+                NarrowValueMode::ZeroExtend64
+            } else if needs_zext {
+                NarrowValueMode::ZeroExtend32
+            } else {
+                NarrowValueMode::None
+            };
+            let rn = input_to_reg(ctx, inputs[0], narrow_mode);
+            let op_ty = match ty {
+                I8 | I16 | I32 => I32,
+                I64 => I64,
+                _ => panic!("Unsupported type for Bitrev/Clz/Cls"),
+            };
+            let bitop = match op {
+                Opcode::Clz | Opcode::Cls | Opcode::Bitrev => BitOp::from((op, op_ty)),
+                Opcode::Ctz => BitOp::from((Opcode::Bitrev, op_ty)),
+                _ => unreachable!(),
+            };
+            ctx.emit(Inst::BitRR { rd, rn, op: bitop });
 
-        Opcode::Ctz => {
-            let rd = output_to_reg(ctx, outputs[0]);
-            let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
-            let op = BitOp::from((Opcode::Bitrev, ty.unwrap()));
-            ctx.emit(Inst::BitRR { rd, rn, op });
-            let op = BitOp::from((Opcode::Clz, ty.unwrap()));
-            ctx.emit(Inst::BitRR {
-                rd,
-                rn: rd.to_reg(),
-                op,
-            });
+            // Both bitrev and ctz use a bit-reverse (rbit) instruction: ctz uses it to reduce
+            // the problem to a clz, and bitrev uses it as the main operation.
+            if op == Opcode::Bitrev || op == Opcode::Ctz {
+                // Reversing an n-bit value (n < 32) with a 32-bit bitrev instruction will place
+                // the reversed result in the highest n bits, so we need to shift them down into
+                // place.
+                let right_shift = match ty {
+                    I8 => Some(24),
+                    I16 => Some(16),
+                    I32 => None,
+                    I64 => None,
+                    _ => panic!("Unsupported type for Bitrev"),
+                };
+                if let Some(s) = right_shift {
+                    ctx.emit(Inst::AluRRImmShift {
+                        alu_op: ALUOp::Lsr32,
+                        rd,
+                        rn: rd.to_reg(),
+                        immshift: ImmShift::maybe_from_u64(s).unwrap(),
+                    });
+                }
+            }
+
+            if op == Opcode::Ctz {
+                ctx.emit(Inst::BitRR {
+                    op: BitOp::from((Opcode::Clz, op_ty)),
+                    rd,
+                    rn: rd.to_reg(),
+                });
+            }
         }
 
         Opcode::Popcnt => {
@@ -1272,7 +1312,10 @@ fn lower_insn_to_regs<C: LowerCtx<Inst>>(ctx: &mut C, insn: IRInst) {
             // x >> 56
             let ty = ty.unwrap();
             let rd = output_to_reg(ctx, outputs[0]);
-            let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+            // FIXME(#1537): zero-extend 8/16/32-bit operands only to 32 bits,
+            // and fix the sequence below to work properly for this.
+            let narrow_mode = NarrowValueMode::ZeroExtend64;
+            let rn = input_to_reg(ctx, inputs[0], narrow_mode);
             let tmp = ctx.tmp(RegClass::I64, I64);
 
             // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then
diff --git a/cranelift/filetests/filetests/vcode/aarch64/bitops.clif b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif
index 8f5e81d322..f1f1a7dba3 100644
--- a/cranelift/filetests/filetests/vcode/aarch64/bitops.clif
+++ b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif
@@ -1,6 +1,34 @@
 test vcode
 target aarch64
 
+function %a(i8) -> i8 {
+block0(v0: i8):
+    v1 = bitrev v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #24
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %a(i16) -> i16 {
+block0(v0: i16):
+    v1 = bitrev v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #16
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %a(i32) -> i32 {
 block0(v0: i32):
     v1 = bitrev v0
@@ -27,6 +55,35 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+
+function %b(i8) -> i8 {
+block0(v0: i8):
+    v1 = clz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxtb w0, w0
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %b(i16) -> i16 {
+block0(v0: i16):
+    v1 = clz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxth w0, w0
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %b(i32) -> i32 {
 block0(v0: i32):
     v1 = clz v0
@@ -53,6 +110,34 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+function %c(i8) -> i8 {
+block0(v0: i8):
+    v1 = cls v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxtb w0, w0
+; nextln: cls w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %c(i16) -> i16 {
+block0(v0: i16):
+    v1 = cls v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxth w0, w0
+; nextln: cls w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %c(i32) -> i32 {
 block0(v0: i32):
     v1 = cls v0
@@ -79,6 +164,36 @@ block0(v0: i64):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
+function %d(i8) -> i8 {
+block0(v0: i8):
+    v1 = ctz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #24
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %d(i16) -> i16 {
+block0(v0: i16):
+    v1 = ctz v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: rbit w0, w0
+; nextln: lsr w0, w0, #16
+; nextln: clz w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
 function %d(i32) -> i32 {
 block0(v0: i32):
     v1 = ctz v0
@@ -140,6 +255,59 @@ block0(v0: i32):
 
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: lsr w1, w0, #1
+; nextln: and x1, x1, #6148914691236517205
+; nextln: sub x1, x0, x1
+; nextln: and x0, x1, #3689348814741910323
+; nextln: lsr x1, x1, #2
+; nextln: and x1, x1, #3689348814741910323
+; nextln: add x0, x1, x0
+; nextln: add x0, x0, x0, LSR 4
+; nextln: and x0, x0, #1085102592571150095
+; nextln: add x0, x0, x0, LSL 8
+; nextln: add x0, x0, x0, LSL 16
+; nextln: add x0, x0, x0, LSL 32
+; nextln: lsr x0, x0, #56
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %d(i16) -> i16 {
+block0(v0: i16):
+    v1 = popcnt v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxth x0, w0
+; nextln: lsr w1, w0, #1
+; nextln: and x1, x1, #6148914691236517205
+; nextln: sub x1, x0, x1
+; nextln: and x0, x1, #3689348814741910323
+; nextln: lsr x1, x1, #2
+; nextln: and x1, x1, #3689348814741910323
+; nextln: add x0, x1, x0
+; nextln: add x0, x0, x0, LSR 4
+; nextln: and x0, x0, #1085102592571150095
+; nextln: add x0, x0, x0, LSL 8
+; nextln: add x0, x0, x0, LSL 16
+; nextln: add x0, x0, x0, LSL 32
+; nextln: lsr x0, x0, #56
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %d(i8) -> i8 {
+block0(v0: i8):
+    v1 = popcnt v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: uxtb x0, w0
 ; nextln: lsr w1, w0, #1
 ; nextln: and x1, x1, #6148914691236517205
 ; nextln: sub x1, x0, x1
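
A note on the rbit-based lowerings above: AArch64's rbit always reverses the full 32- or 64-bit register, so reversing an 8- or 16-bit value lands the result in the top n bits and the extra lsr moves it back into place; for full-width ctz (the I32/I64 case, where no shift is emitted), rbit followed by clz does the whole job. The following is a minimal scalar sketch of those two identities in plain Rust, not Cranelift code; the function names are made up for illustration.

// i8 bitrev via a 32-bit reverse: rbit w0, w0 ; lsr w0, w0, #24.
// Whatever sits in bits 8..31 of the register is reversed into bits 0..23 and
// then discarded by the shift, so only the low byte affects the result.
fn bitrev_i8_via_rbit32(x: u32) -> u8 {
    let reversed = x.reverse_bits(); // rbit w0, w0
    (reversed >> 24) as u8           // lsr w0, w0, #24
}

// Full-width ctz via rbit + clz: ctz(x) == clz(bit-reverse(x)), including
// x == 0 (both sides give 32).
fn ctz_i32_via_rbit_clz(x: u32) -> u32 {
    x.reverse_bits().leading_zeros() // rbit w0, w0 ; clz w0, w0
}

fn main() {
    assert_eq!(bitrev_i8_via_rbit32(0b0001_0010), 0b0100_1000);
    assert_eq!(bitrev_i8_via_rbit32(0xABCD_0012), 0b0100_1000); // upper bits ignored
    assert_eq!(ctz_i32_via_rbit_clz(0b1000), 3);
    assert_eq!(ctz_i32_via_rbit_clz(0), 32);
}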
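The popcnt expectations are the usual SWAR popcount: fold the value into 2-bit, 4-bit and then 8-bit partial sums, multiply by 0x0101_0101_0101_0101 via three shift-adds, and read the total out of the top byte; the decimal immediates in the checks are the 0x5555..., 0x3333... and 0x0f0f... masks. A scalar sketch of that sequence in plain Rust (assuming, as the FIXME notes, an operand already zero-extended to 64 bits; an illustration, not the backend code):

// SWAR popcount matching the emitted sequence. The three shift-adds expand a
// multiply by 0x0101_0101_0101_0101, accumulating every byte's count into the
// top byte, which the final shift extracts.
fn popcnt_swar(x: u64) -> u64 {
    let x = x - ((x >> 1) & 0x5555_5555_5555_5555);                           // 2-bit sums
    let x = (x & 0x3333_3333_3333_3333) + ((x >> 2) & 0x3333_3333_3333_3333); // 4-bit sums
    let x = (x + (x >> 4)) & 0x0f0f_0f0f_0f0f_0f0f;                           // 8-bit sums
    let x = x.wrapping_add(x << 8);  // add x0, x0, x0, LSL 8
    let x = x.wrapping_add(x << 16); // add x0, x0, x0, LSL 16
    let x = x.wrapping_add(x << 32); // add x0, x0, x0, LSL 32
    x >> 56                          // lsr x0, x0, #56
}

fn main() {
    for &v in &[0u64, 1, 0xFF, 0x8000_0000_0000_0000, 0x1234_5678_9ABC_DEF0, u64::MAX] {
        assert_eq!(popcnt_swar(v), u64::from(v.count_ones()));
    }
}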