From c38a5e8b62040915b7d60f6a55e8fb5fab81d859 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Wed, 26 May 2021 16:01:26 +0100 Subject: [PATCH] aarch64: Add basic i128 bit ops to the AArch64 backend Currently we just basically use a two-instruction version of the same i64 ops. IMMLogic doesn't really support multiple register inputs, so it's left as a TODO for future optimizations. --- .../codegen/src/isa/aarch64/lower_inst.rs | 54 ++++++- .../filetests/isa/aarch64/bitops.clif | 91 ++++++++++++ .../filetests/runtests/i128-bitops.clif | 135 ++++++++++++++++++ 3 files changed, 275 insertions(+), 5 deletions(-) create mode 100644 cranelift/filetests/filetests/runtests/i128-bitops.clif diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index c4384fa83a..a18d24deae 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -661,14 +661,31 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Bnot => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let out_regs = get_output_reg(ctx, outputs[0]); let ty = ty.unwrap(); - if !ty.is_vector() { + if ty == I128 { + // TODO: We can merge this block with the one below once we support immlogic here + let in_regs = put_input_in_regs(ctx, inputs[0]); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::OrrNot64, + rd: out_regs.regs()[0], + rn: zero_reg(), + rm: in_regs.regs()[0], + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::OrrNot64, + rd: out_regs.regs()[1], + rn: zero_reg(), + rm: in_regs.regs()[1], + }); + } else if !ty.is_vector() { + let rd = out_regs.only_reg().unwrap(); let rm = put_input_in_rs_immlogic(ctx, inputs[0], NarrowValueMode::None); let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64); // NOT rd, rm ==> ORR_NOT rd, zero, rm ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm)); } else { + let rd = out_regs.only_reg().unwrap(); let rm = put_input_in_reg(ctx, inputs[0], 
NarrowValueMode::None); ctx.emit(Inst::VecMisc { op: VecMisc2::Not, @@ -685,9 +702,36 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::BandNot | Opcode::BorNot | Opcode::BxorNot => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let out_regs = get_output_reg(ctx, outputs[0]); let ty = ty.unwrap(); - if !ty.is_vector() { + if ty == I128 { + // TODO: Support immlogic here + let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let alu_op = match op { + Opcode::Band => ALUOp::And64, + Opcode::Bor => ALUOp::Orr64, + Opcode::Bxor => ALUOp::Eor64, + Opcode::BandNot => ALUOp::AndNot64, + Opcode::BorNot => ALUOp::OrrNot64, + Opcode::BxorNot => ALUOp::EorNot64, + _ => unreachable!(), + }; + + ctx.emit(Inst::AluRRR { + alu_op, + rd: out_regs.regs()[0], + rn: lhs.regs()[0], + rm: rhs.regs()[0], + }); + ctx.emit(Inst::AluRRR { + alu_op, + rd: out_regs.regs()[1], + rn: lhs.regs()[1], + rm: rhs.regs()[1], + }); + } else if !ty.is_vector() { + let rd = out_regs.only_reg().unwrap(); let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_rs_immlogic(ctx, inputs[1], NarrowValueMode::None); let alu_op = match op { @@ -711,7 +755,7 @@ pub(crate) fn lower_insn_to_regs>( let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let rd = out_regs.only_reg().unwrap(); ctx.emit(Inst::VecRRR { alu_op, diff --git a/cranelift/filetests/filetests/isa/aarch64/bitops.clif b/cranelift/filetests/filetests/isa/aarch64/bitops.clif index e651be167a..32e7fe7f04 100644 --- a/cranelift/filetests/filetests/isa/aarch64/bitops.clif +++ b/cranelift/filetests/filetests/isa/aarch64/bitops.clif @@ -293,3 +293,94 @@ block0: ; nextln: sbfx w0, w0, #0, #1 ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret + +function %bnot_i128(i128) -> i128 { +block0(v0: i128): + v1 = bnot v0 + return v1 
+} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orn x0, xzr, x0 +; nextln: orn x1, xzr, x1 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %band_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = band v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: and x0, x0, x2 +; nextln: and x1, x1, x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %bor_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = bor v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orr x0, x0, x2 +; nextln: orr x1, x1, x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %bxor_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = bxor v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: eor x0, x0, x2 +; nextln: eor x1, x1, x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %band_not_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = band_not v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: bic x0, x0, x2 +; nextln: bic x1, x1, x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %bor_not_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = bor_not v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orn x0, x0, x2 +; nextln: orn x1, x1, x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %bxor_not_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = bxor_not v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: eon x0, x0, x2 +; nextln: eon x1, x1, x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret \ No newline at end of file diff --git a/cranelift/filetests/filetests/runtests/i128-bitops.clif b/cranelift/filetests/filetests/runtests/i128-bitops.clif new file mode 100644 index 0000000000..5a1139f1d0 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/i128-bitops.clif @@ -0,0 +1,135 @@ +test run +target aarch64 +; target s390x TODO: Not yet implemented on s390x +; target x86_64 TODO: Not yet implemented on x86_64 + +; i128 tests +; TODO: Cleanup these tests when we have native support for i128 immediates in CLIF's parser +function %bnot_i128(i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64): +v2 = iconcat v0, v1 + +v3 = bnot v2 + +v4, v5 = isplit v3 +return v4, v5 +} +; run: %bnot_i128(0, 0) == [-1, -1] +; run: %bnot_i128(-1, -1) == [0, 0] +; run: %bnot_i128(-1, 0) == [0, -1] + +; run: %bnot_i128(0x3F001111_3F001111, 0x21350000_21350000) == [0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF] + + +function %band_i128(i64, i64, i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64,v2: i64,v3: i64): +v4 = iconcat v0, v1 +v5 = iconcat v2, v3 + +v6 = band v4, v5 + +v7, v8 = isplit v6 +return v7, v8 +} +; run: %band_i128(0, 0, 0, 0) == [0, 0] +; run: %band_i128(-1, -1, 0, 0) == [0, 0] +; run: %band_i128(-1, -1, -1, -1) == [-1, -1] +; run: %band_i128(-1, -1, 0, -1) == [0, -1] + +; run: %band_i128(0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210, 0x01234567_89ABCDEF) == [0, 0] +; run: %band_i128(0xF1FFFEFE_F1FFFEFE, 0xFEEEFFFF_FEEEFFFF, 0xCEFFEFEF_CEFFEFEF, 0xDFDBFFFF_DFDBFFFF) == [0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF] + + +function %bor_i128(i64, i64, i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64,v2: i64,v3: i64): +v4 = iconcat v0, v1 +v5 = iconcat v2, v3 + +v6 = bor v4, v5 + +v7, v8 = isplit v6 +return v7, v8 +} +; run: %bor_i128(0, 0, 0, 0) == [0, 0] +; run: %bor_i128(-1, -1, 0, 0) == [-1, -1] +; run: %bor_i128(-1, -1, -1, -1) == [-1, 
-1] +; run: %bor_i128(0, 0, 0, -1) == [0, -1] + +; run: %bor_i128(0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210, 0x01234567_89ABCDEF) == [-1, -1] +; run: %bor_i128(0x80AAAAAA_80AAAAAA, 0x8A8AAAAA_8A8AAAAA, 0x40554444_40554444, 0x54405555_54405555) == [0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF] + + +function %bxor_i128(i64, i64, i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64,v2: i64,v3: i64): +v4 = iconcat v0, v1 +v5 = iconcat v2, v3 + +v6 = bxor v4, v5 + +v7, v8 = isplit v6 +return v7, v8 +} +; run: %bxor_i128(0, 0, 0, 0) == [0, 0] +; run: %bxor_i128(-1, -1, 0, 0) == [-1, -1] +; run: %bxor_i128(-1, -1, -1, -1) == [0, 0] +; run: %bxor_i128(-1, -1, 0, -1) == [-1, 0] + +; run: %bxor_i128(0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210, 0x01234567_89ABCDEF) == [-1, -1] +; run: %bxor_i128(0x8FA50A64_8FA50A64, 0x9440A07D_9440A07D, 0x4F5AE48A_4F5AE48A, 0x4A8A5F82_4A8A5F82) == [0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF] + + +function %band_not_i128(i64, i64, i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64,v2: i64,v3: i64): +v4 = iconcat v0, v1 +v5 = iconcat v2, v3 + +v6 = band_not v4, v5 + +v7, v8 = isplit v6 +return v7, v8 +} +; run: %band_not_i128(0, 0, 0, 0) == [0, 0] +; run: %band_not_i128(-1, -1, 0, 0) == [-1, -1] +; run: %band_not_i128(-1, -1, -1, -1) == [0, 0] +; run: %band_not_i128(-1, -1, 0, -1) == [-1, 0] + +; run: %band_not_i128(0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210, 0x01234567_89ABCDEF) == [0x01234567_89ABCDEF, 0xFEDCBA98_76543210] +; run: %band_not_i128(0xF1FFFEFE_F1FFFEFE, 0xFEEEFFFF_FEEEFFFF, 0x31001010_31001010, 0x20240000_20240000) == [0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF] + + +function %bor_not_i128(i64, i64, i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64,v2: i64,v3: i64): +v4 = iconcat v0, v1 +v5 = iconcat v2, v3 + +v6 = bor_not v4, v5 + +v7, v8 = isplit v6 +return v7, v8 +} +; run: %bor_not_i128(0, 0, 0, 0) == [-1, -1] +; run: %bor_not_i128(-1, -1, 0, 0) == [-1, -1] +; run: %bor_not_i128(-1, -1, 
-1, -1) == [-1, -1] +; run: %bor_not_i128(-1, 0, 0, -1) == [-1, 0] + +; run: %bor_not_i128(0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210, 0x01234567_89ABCDEF) == [0x01234567_89ABCDEF, 0xFEDCBA98_76543210] +; run: %bor_not_i128(0x80AAAAAA_80AAAAAA, 0x8A8AAAAA_8A8AAAAA, 0xBFAABBBB_BFAABBBB, 0xABBFAAAA_ABBFAAAA) == [0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF] + + +function %bxor_not_i128(i64, i64, i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64,v2: i64,v3: i64): +v4 = iconcat v0, v1 +v5 = iconcat v2, v3 + +v6 = bxor_not v4, v5 + +v7, v8 = isplit v6 +return v7, v8 +} +; run: %bxor_not_i128(0, 0, 0, 0) == [-1, -1] +; run: %bxor_not_i128(-1, -1, 0, 0) == [0, 0] +; run: %bxor_not_i128(-1, -1, -1, -1) == [-1, -1] +; run: %bxor_not_i128(-1, -1, 0, -1) == [0, -1] + +; run: %bxor_not_i128(0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210, 0x01234567_89ABCDEF) == [0, 0] +; run: %bxor_not_i128(0x8FA50A64_8FA50A64, 0x9440A07D_9440A07D, 0xB0A51B75_B0A51B75, 0xB575A07D_B575A07D) == [0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF]