diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 2a3ee88fcc..d99bf620c2 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -244,21 +244,70 @@ pub(crate) fn lower_insn_to_regs>( } Opcode::Imul => { - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); + let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + + let rd = dst.regs()[0]; + let rn = lhs.regs()[0]; + let rm = rhs.regs()[0]; + let ty = ty.unwrap(); - if !ty.is_vector() { - let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); - ctx.emit(Inst::AluRRRR { - alu_op, - rd, - rn, - rm, - ra: zero_reg(), - }); - } else { - if ty == I64X2 { + match ty { + I128 => { + assert_eq!(lhs.len(), 2); + assert_eq!(rhs.len(), 2); + assert_eq!(dst.len(), 2); + + // 128bit mul formula: + // dst_lo = lhs_lo * rhs_lo + // dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo) + // + // We can convert the above formula into the following + // umulh dst_hi, lhs_lo, rhs_lo + // madd dst_hi, lhs_lo, rhs_hi, dst_hi + // madd dst_hi, lhs_hi, rhs_lo, dst_hi + // mul dst_lo, lhs_lo, rhs_lo + + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::UMulH, + rd: dst.regs()[1], + rn: lhs.regs()[0], + rm: rhs.regs()[0], + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[1], + rn: lhs.regs()[0], + rm: rhs.regs()[1], + ra: dst.regs()[1].to_reg(), + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[1], + rn: lhs.regs()[1], + rm: rhs.regs()[0], + ra: dst.regs()[1].to_reg(), + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp3::MAdd64, + rd: dst.regs()[0], + rn: lhs.regs()[0], + rm: rhs.regs()[0], + ra: zero_reg(), + }); + } + ty if !ty.is_vector() => { + let alu_op = choose_32_64(ty, ALUOp3::MAdd32, ALUOp3::MAdd64); + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra: zero_reg(), + }); + } + I64X2 => { let tmp1 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); let tmp2 = ctx.alloc_tmp(I64X2).only_reg().unwrap(); @@ -363,7 +412,8 @@ pub(crate) fn lower_insn_to_regs>( rm: tmp1.to_reg(), size: VectorSize::Size32x2, }); - } else { + } + ty if ty.is_vector() => { ctx.emit(Inst::VecRRR { alu_op: VecALUOp::Mul, rd, @@ -372,6 +422,7 @@ pub(crate) fn lower_insn_to_regs>( size: VectorSize::from_ty(ty), }); } + _ => panic!("Unable to emit mul for {}", ty), } } diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif index 90fdd43e41..c847ae171e 100644 --- a/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif +++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic-run.clif @@ -11,16 +11,6 @@ block0: } ; run: %i128_const_0() == [0, 0] -; TODO: Blocked by https://github.com/bytecodealliance/wasmtime/issues/2906 -;function %i128_const_neg_1() -> i64, i64 { -;block0: -; v1 = iconst.i128 -1 -; v2, v3 = isplit v1 -; return v2, v3 -;} -; r-un: %i128_const_neg_1() == [0xffffffff_ffffffff, 0xffffffff_ffffffff] - - function %add_i128(i64, i64, i64, i64) -> i64, i64 { block0(v0: i64,v1: i64,v2: i64,v3: i64): v4 = iconcat v0, v1 @@ -38,6 +28,9 @@ block0(v0: i64,v1: i64,v2: i64,v3: i64): ; run: %add_i128(1, 0, -1, -1) == [0, 0] ; run: %add_i128(-1, 0, 1, 0) == [0, 1] +; run: %add_i128(0x01234567_89ABCDEF, 0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [-1, -1] +; run: %add_i128(0x06060606_06060606, 0xA00A00A0_0A00A00A, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x36363636_36363636, 0xABBABBAB_BABBABBA] +; run: %add_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF] function %sub_i128(i64, i64, i64, i64) -> i64, i64 { block0(v0: i64,v1: i64,v2: i64,v3: i64): @@ -53,4 +46,33 @@ block0(v0: i64,v1: i64,v2: i64,v3: i64): ; run: %sub_i128(1, 0, 1, 0) == [0, 0] ; run: %sub_i128(1, 0, 0, 0) == [1, 0] ; run: %sub_i128(0, 0, 1, 0) == [-1, -1] -; run: %sub_i128(0, 0, -1, -1) == [1, 0] \ No newline at end of file +; run: %sub_i128(0, 0, -1, -1) == [1, 0] + +; run: %sub_i128(-1, -1, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [0x01234567_89ABCDEF, 0x01234567_89ABCDEF] +; run: %sub_i128(0x36363636_36363636, 0xABBABBAB_BABBABBA, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x06060606_06060606, 0xA00A00A0_0A00A00A] +; run: %sub_i128(0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE] + + +function %mul_i128(i64, i64, i64, i64) -> i64, i64 { +block0(v0: i64,v1: i64,v2: i64,v3: i64): + v4 = iconcat v0, v1 + v5 = iconcat v2, v3 + + v6 = imul v4, v5 + + v7, v8 = isplit v6 + return v7, v8 +} +; run: %mul_i128(0, 0, 0, 0) == [0, 0] +; run: %mul_i128(1, 0, 1, 0) == [1, 0] +; run: %mul_i128(1, 0, 0, 0) == [0, 0] +; run: %mul_i128(0, 0, 1, 0) == [0, 0] +; run: %mul_i128(2, 0, 1, 0) == [2, 0] +; run: %mul_i128(2, 0, 2, 0) == [4, 0] +; run: %mul_i128(1, 0, -1, -1) == [-1, -1] +; run: %mul_i128(2, 0, -1, -1) == [-2, -1] + +; run: %mul_i128(0x01010101_01010101, 0x01010101_01010101, 13, 0) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D] +; run: %mul_i128(13, 0, 0x01010101_01010101, 0x01010101_01010101) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D] +; run: %mul_i128(0x00000000_01234567, 0x89ABCDEF_00000000, 0x00000000_FEDCBA98, 0x76543210_00000000) == [0x0121FA00_23E20B28, 0xE2946058_00000000] +; run: %mul_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF) == [0xDB6B1E48_19BA1112, 0x5ECD38B5_9D1C2B7E] diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif index edd25c0023..41d65ee293 100644 --- a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif +++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif @@ -453,3 +453,18 @@ block0(v0: i128, v1: i128): ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret +function %mul_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = imul v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: umulh x4, x0, x2 +; nextln: madd x3, x0, x3, x4 +; nextln: madd x1, x1, x2, x3 +; nextln: madd x0, x0, x2, xzr +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret +