x64: move multiplication lowering

Since the lowering of `imul` complicated the other ALU operations it was
matched with, and since future commits will alter the multiplication
lowering further, this change moves the `imul` lowering to its own match
block.
Author: Andrew Brown
Date:   2021-05-10 14:50:41 -07:00
Parent: fa1faf5d22
Commit: c982d2be65
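
In outline, the change splits the single ALU arm in `lower_insn_to_regs` into two: the simple ALU opcodes keep their shared "pick an opcode, then emit" path, while `Imul` gets an arm of its own covering the i64x2, other-vector, i128, and scalar cases. A minimal, self-contained sketch of that dispatch shape (the `Opcode` enum and `lower` function here are simplified stand-ins, not the backend's real types):

    #[allow(dead_code)]
    #[derive(Clone, Copy)]
    enum Opcode {
        Iadd,
        Isub,
        Imul,
        Band,
        Bor,
        Bxor,
    }

    fn lower(op: Opcode) -> &'static str {
        match op {
            // Shared arm: simple ALU ops that map onto one vector/scalar instruction.
            Opcode::Iadd | Opcode::Isub | Opcode::Band | Opcode::Bor | Opcode::Bxor => {
                "simple ALU lowering"
            }
            // Dedicated arm: multiplication needs its own sequences (i64x2, i128,
            // scalar), so it no longer complicates the arm above.
            Opcode::Imul => "multiplication-specific lowering",
        }
    }

    fn main() {
        assert_eq!(lower(Opcode::Imul), "multiplication-specific lowering");
        assert_eq!(lower(Opcode::Iadd), "simple ALU lowering");
    }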


@@ -1511,7 +1511,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::Isub
         | Opcode::SsubSat
         | Opcode::UsubSat
-        | Opcode::Imul
         | Opcode::AvgRound
         | Opcode::Band
         | Opcode::Bor
@@ -1553,13 +1552,121 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         types::I16X8 => SseOpcode::Psubusw,
                         _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty),
                     },
-                    Opcode::Imul => match ty {
-                        types::I16X8 => SseOpcode::Pmullw,
-                        types::I32X4 => SseOpcode::Pmulld,
-                        types::I64X2 => {
-                            // Note for I64X2 we describe a lane A as being composed of a
-                            // 32-bit upper half "Ah" and a 32-bit lower half "Al".
-                            // The 32-bit long hand multiplication can then be written as:
+                    Opcode::AvgRound => match ty {
+                        types::I8X16 => SseOpcode::Pavgb,
+                        types::I16X8 => SseOpcode::Pavgw,
+                        _ => panic!("Unsupported type for packed avg_round instruction: {}", ty),
+                    },
+                    Opcode::Band => match ty {
+                        types::F32X4 => SseOpcode::Andps,
+                        types::F64X2 => SseOpcode::Andpd,
+                        _ => SseOpcode::Pand,
+                    },
+                    Opcode::Bor => match ty {
+                        types::F32X4 => SseOpcode::Orps,
+                        types::F64X2 => SseOpcode::Orpd,
+                        _ => SseOpcode::Por,
+                    },
+                    Opcode::Bxor => match ty {
+                        types::F32X4 => SseOpcode::Xorps,
+                        types::F64X2 => SseOpcode::Xorpd,
+                        _ => SseOpcode::Pxor,
+                    },
+                    _ => panic!("Unsupported packed instruction: {}", op),
+                };
+                let lhs = put_input_in_reg(ctx, inputs[0]);
+                let rhs = input_to_reg_mem(ctx, inputs[1]);
+                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                // Move the `lhs` to the same register as `dst`.
+                ctx.emit(Inst::gen_move(dst, lhs, ty));
+                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+            } else if ty == types::I128 || ty == types::B128 {
+                let alu_ops = match op {
+                    Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc),
+                    Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb),
+                    Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And),
+                    Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or),
+                    Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor),
+                    _ => panic!("Unsupported opcode with 128-bit integers: {:?}", op),
+                };
+                let lhs = put_input_in_regs(ctx, inputs[0]);
+                let rhs = put_input_in_regs(ctx, inputs[1]);
+                let dst = get_output_reg(ctx, outputs[0]);
+                assert_eq!(lhs.len(), 2);
+                assert_eq!(rhs.len(), 2);
+                assert_eq!(dst.len(), 2);
+                // For add, sub, and, or, xor: just do ops on lower then upper
+                // half. Carry-flag propagation is implicit (add/adc, sub/sbb).
+                ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
+                ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64));
+                ctx.emit(Inst::alu_rmi_r(
+                    OperandSize::Size64,
+                    alu_ops.0,
+                    RegMemImm::reg(rhs.regs()[0]),
+                    dst.regs()[0],
+                ));
+                ctx.emit(Inst::alu_rmi_r(
+                    OperandSize::Size64,
+                    alu_ops.1,
+                    RegMemImm::reg(rhs.regs()[1]),
+                    dst.regs()[1],
+                ));
+            } else {
+                let size = if ty == types::I64 {
+                    OperandSize::Size64
+                } else {
+                    OperandSize::Size32
+                };
+                let alu_op = match op {
+                    Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
+                    Opcode::Isub => AluRmiROpcode::Sub,
+                    Opcode::Band => AluRmiROpcode::And,
+                    Opcode::Bor => AluRmiROpcode::Or,
+                    Opcode::Bxor => AluRmiROpcode::Xor,
+                    _ => unreachable!(),
+                };
+                let (lhs, rhs) = match op {
+                    Opcode::Iadd
+                    | Opcode::IaddIfcout
+                    | Opcode::Band
+                    | Opcode::Bor
+                    | Opcode::Bxor => {
+                        // For commutative operations, try to commute operands if one is an
+                        // immediate or direct memory reference. Do so by converting LHS to RMI; if
+                        // reg, then always convert RHS to RMI; else, use LHS as RMI and convert
+                        // RHS to reg.
+                        let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
+                        if let RegMemImm::Reg { reg: lhs_reg } = lhs {
+                            let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+                            (lhs_reg, rhs)
+                        } else {
+                            let rhs_reg = put_input_in_reg(ctx, inputs[1]);
+                            (rhs_reg, lhs)
+                        }
+                    }
+                    Opcode::Isub => (
+                        put_input_in_reg(ctx, inputs[0]),
+                        input_to_reg_mem_imm(ctx, inputs[1]),
+                    ),
+                    _ => unreachable!(),
+                };
+                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst));
+                ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst));
+            }
+        }
+        Opcode::Imul => {
+            let ty = ty.unwrap();
+            if ty == types::I64X2 {
+                // For I64X2 multiplication we describe a lane A as being
+                // composed of a 32-bit upper half "Ah" and a 32-bit lower half
+                // "Al". The 32-bit long hand multiplication can then be written
+                // as:
                 //   Ah Al
                 // * Bh Bl
                 //   -----
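
The i128/b128 arm added above relies on the implicit carry chain the comment mentions: `add` on the low 64-bit halves sets the carry flag, and `adc` folds that carry into the high halves. A quick model of the same split in ordinary Rust (plain integer arithmetic, not backend code; `overflowing_add` stands in for the carry flag):

    // Reassemble a 128-bit add from two 64-bit adds plus the carry between them.
    fn add128_via_halves(a: u128, b: u128) -> u128 {
        let (a_lo, a_hi) = (a as u64, (a >> 64) as u64);
        let (b_lo, b_hi) = (b as u64, (b >> 64) as u64);
        // `add lo, lo`: the overflow here plays the role of CF.
        let (lo, carry) = a_lo.overflowing_add(b_lo);
        // `adc hi, hi`: high halves plus the carry out of the low addition.
        let hi = a_hi.wrapping_add(b_hi).wrapping_add(carry as u64);
        ((hi as u128) << 64) | lo as u128
    }

    fn main() {
        let (a, b) = (u64::MAX as u128, 0x1_0000_0000_0000_0001_u128);
        assert_eq!(add128_via_halves(a, b), a.wrapping_add(b));
        assert_eq!(add128_via_halves(u128::MAX, 1), 0);
    }
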
@@ -1570,12 +1677,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // So for each lane we will compute:
                 // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
                 //
-                // Note, the algorithm will use pmuldq which operates directly on
-                // the lower 32-bit (Al or Bl) of a lane and writes the result
-                // to the full 64-bits of the lane of the destination. For this
-                // reason we don't need shifts to isolate the lower 32-bits, however
-                // we will need to use shifts to isolate the high 32-bits when doing
-                // calculations, i.e. Ah == A >> 32
+                // Note, the algorithm will use pmuldq which operates directly
+                // on the lower 32-bit (Al or Bl) of a lane and writes the
+                // result to the full 64-bits of the lane of the destination.
+                // For this reason we don't need shifts to isolate the lower
+                // 32-bits, however, we will need to use shifts to isolate the
+                // high 32-bits when doing calculations, i.e., Ah == A >> 32.
                 //
                 // The full sequence then is as follows:
                 // A' = A
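
The comment above reduces each 64-bit lane product to 32-bit pieces: only `Al * Bl` and the two cross terms shifted left by 32 survive modulo 2^64, and `Ah * Bh` drops out entirely. The identity can be checked with a few lines of plain Rust (this models the arithmetic only, not the SSE instruction sequence; `wrapping_mul` stands in for the pmuldq/pmuludq and shift steps):

    // Low 64 bits of A * B, computed from 32-bit halves as in the comment:
    //   A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32   (mod 2^64)
    fn mul64_via_32bit_halves(a: u64, b: u64) -> u64 {
        let (al, ah) = (a & 0xffff_ffff, a >> 32);
        let (bl, bh) = (b & 0xffff_ffff, b >> 32);
        let lo = al.wrapping_mul(bl); // 32x32 -> 64 product of the low halves
        let cross = ah.wrapping_mul(bl).wrapping_add(al.wrapping_mul(bh));
        // Only the low 32 bits of `cross` survive the shift; Ah * Bh is
        // shifted out of the 64-bit lane altogether.
        lo.wrapping_add(cross << 32)
    }

    fn main() {
        let (a, b) = (0xdead_beef_1234_5678_u64, 0x0bad_f00d_9abc_def0_u64);
        assert_eq!(mul64_via_32bit_halves(a, b), a.wrapping_mul(b));
    }
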
@@ -1655,31 +1762,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     rhs_1,
                 ));
                 ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
-                            return Ok(());
-                        }
+            } else if ty.lane_count() > 1 {
+                // Emit single instruction lowerings for the remaining vector
+                // multiplications.
+                let sse_op = match ty {
+                    types::I16X8 => SseOpcode::Pmullw,
+                    types::I32X4 => SseOpcode::Pmulld,
                     _ => panic!("Unsupported type for packed imul instruction: {}", ty),
-                        },
-                        Opcode::AvgRound => match ty {
-                            types::I8X16 => SseOpcode::Pavgb,
-                            types::I16X8 => SseOpcode::Pavgw,
-                            _ => panic!("Unsupported type for packed avg_round instruction: {}", ty),
-                        },
-                        Opcode::Band => match ty {
-                            types::F32X4 => SseOpcode::Andps,
-                            types::F64X2 => SseOpcode::Andpd,
-                            _ => SseOpcode::Pand,
-                        },
-                        Opcode::Bor => match ty {
-                            types::F32X4 => SseOpcode::Orps,
-                            types::F64X2 => SseOpcode::Orpd,
-                            _ => SseOpcode::Por,
-                        },
-                        Opcode::Bxor => match ty {
-                            types::F32X4 => SseOpcode::Xorps,
-                            types::F64X2 => SseOpcode::Xorpd,
-                            _ => SseOpcode::Pxor,
-                        },
-                        _ => panic!("Unsupported packed instruction: {}", op),
                 };
                 let lhs = put_input_in_reg(ctx, inputs[0]);
                 let rhs = input_to_reg_mem(ctx, inputs[1]);
@@ -1689,16 +1778,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 ctx.emit(Inst::gen_move(dst, lhs, ty));
                 ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
             } else if ty == types::I128 || ty == types::B128 {
-                let alu_ops = match op {
-                    Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc),
-                    Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb),
-                    // multiply handled specially below
-                    Opcode::Imul => (AluRmiROpcode::Mul, AluRmiROpcode::Mul),
-                    Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And),
-                    Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or),
-                    Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor),
-                    _ => panic!("Unsupported opcode with 128-bit integers: {:?}", op),
-                };
+                // Handle 128-bit multiplications.
                 let lhs = put_input_in_regs(ctx, inputs[0]);
                 let rhs = put_input_in_regs(ctx, inputs[1]);
                 let dst = get_output_reg(ctx, outputs[0]);
@@ -1706,24 +1786,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 assert_eq!(lhs.len(), 2);
                 assert_eq!(rhs.len(), 2);
                 assert_eq!(dst.len(), 2);
-                if op != Opcode::Imul {
-                    // add, sub, and, or, xor: just do ops on lower then upper half. Carry-flag
-                    // propagation is implicit (add/adc, sub/sbb).
-                    ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64));
-                    ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64));
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        alu_ops.0,
-                        RegMemImm::reg(rhs.regs()[0]),
-                        dst.regs()[0],
-                    ));
-                    ctx.emit(Inst::alu_rmi_r(
-                        OperandSize::Size64,
-                        alu_ops.1,
-                        RegMemImm::reg(rhs.regs()[1]),
-                        dst.regs()[1],
-                    ));
-                } else {
                 // mul:
                 // dst_lo = lhs_lo * rhs_lo
                 // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo
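
The `mul` comment closing the hunk above expresses the 128-bit product in 64-bit pieces: the low result half is the low half of `lhs_lo * rhs_lo`, and the high result half adds the upper half of that product (`umulhi`) to the two cross terms. The identity is easy to check in plain Rust (again a model of the arithmetic only, not the register-level sequence the lowering emits):

    fn mul128_via_64bit_halves(a: u128, b: u128) -> u128 {
        let (a_lo, a_hi) = (a as u64, (a >> 64) as u64);
        let (b_lo, b_hi) = (b as u64, (b >> 64) as u64);
        // The full 64x64 -> 128 product of the low halves.
        let wide = (a_lo as u128) * (b_lo as u128);
        let dst_lo = wide as u64;
        let umulhi = (wide >> 64) as u64;
        // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo
        let dst_hi = umulhi
            .wrapping_add(a_lo.wrapping_mul(b_hi))
            .wrapping_add(a_hi.wrapping_mul(b_lo));
        ((dst_hi as u128) << 64) | dst_lo as u128
    }

    fn main() {
        let a = 0x0123_4567_89ab_cdef_fedc_ba98_7654_3210_u128;
        let b = 0x0fed_cba9_8765_4321_0f1e_2d3c_4b5a_6978_u128;
        assert_eq!(mul128_via_64bit_halves(a, b), a.wrapping_mul(b));
    }
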
@@ -1783,48 +1845,25 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     RegMemImm::reg(regs::rdx()),
                     dst.regs()[1],
                 ));
-                }
             } else {
                 let size = if ty == types::I64 {
                     OperandSize::Size64
                 } else {
                     OperandSize::Size32
                 };
-                let alu_op = match op {
-                    Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add,
-                    Opcode::Isub => AluRmiROpcode::Sub,
-                    Opcode::Imul => AluRmiROpcode::Mul,
-                    Opcode::Band => AluRmiROpcode::And,
-                    Opcode::Bor => AluRmiROpcode::Or,
-                    Opcode::Bxor => AluRmiROpcode::Xor,
-                    _ => unreachable!(),
-                };
-                let (lhs, rhs) = match op {
-                    Opcode::Iadd
-                    | Opcode::IaddIfcout
-                    | Opcode::Imul
-                    | Opcode::Band
-                    | Opcode::Bor
-                    | Opcode::Bxor => {
-                        // For commutative operations, try to commute operands if one is an
-                        // immediate or direct memory reference. Do so by converting LHS to RMI; if
-                        // reg, then always convert RHS to RMI; else, use LHS as RMI and convert
-                        // RHS to reg.
+                let alu_op = AluRmiROpcode::Mul;
+                // For commutative operations, try to commute operands if one is
+                // an immediate or direct memory reference. Do so by converting
+                // LHS to RMI; if reg, then always convert RHS to RMI; else, use
+                // LHS as RMI and convert RHS to reg.
                 let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
-                        if let RegMemImm::Reg { reg: lhs_reg } = lhs {
+                let (lhs, rhs) = if let RegMemImm::Reg { reg: lhs_reg } = lhs {
                     let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
                     (lhs_reg, rhs)
                 } else {
                     let rhs_reg = put_input_in_reg(ctx, inputs[1]);
                     (rhs_reg, lhs)
-                        }
-                    }
-                    Opcode::Isub => (
-                        put_input_in_reg(ctx, inputs[0]),
-                        input_to_reg_mem_imm(ctx, inputs[1]),
-                    ),
-                    _ => unreachable!(),
                 };
                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
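
The scalar arm rewritten above keeps the operand-commuting step but no longer needs the surrounding `match op`, since multiplication is always commutative: if the LHS is already in a register, the RHS stays as the reg/mem/imm operand; otherwise the operands swap roles and the RHS is forced into a register. A simplified model of that choice (the `Operand` enum and `force_reg` closure here are hypothetical stand-ins for `RegMemImm` and `put_input_in_reg`):

    #[derive(Debug, PartialEq)]
    enum Operand {
        Reg(u8),
        Mem(u32),
        Imm(i32),
    }

    // Mirrors the if/else above: keep a register LHS and fold the RHS into the
    // instruction; otherwise commute, materializing the RHS into a register.
    fn pick_operands(
        lhs: Operand,
        rhs: Operand,
        force_reg: impl FnOnce(Operand) -> u8,
    ) -> (u8, Operand) {
        if let Operand::Reg(r) = lhs {
            (r, rhs)
        } else {
            (force_reg(rhs), lhs)
        }
    }

    fn main() {
        // An immediate LHS gets commuted into the reg/mem/imm slot.
        let picked = pick_operands(Operand::Imm(4), Operand::Reg(7), |o| match o {
            Operand::Reg(r) => r,
            _ => 0, // a real lowering would materialize a register here
        });
        assert_eq!(picked, (7, Operand::Imm(4)));
        assert_eq!(
            pick_operands(Operand::Reg(3), Operand::Mem(16), |_| unreachable!()),
            (3, Operand::Mem(16))
        );
    }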