ISLE: port fmin, fmax, fmin_pseudo, fmax_pseudo on x64. (#3856)

Chris Fallin
2022-02-28 14:40:26 -08:00
committed by GitHub
parent d9dfc44c32
commit cd173cfe8e
7 changed files with 938 additions and 482 deletions


@@ -910,7 +910,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Fadd
| Opcode::Fsub
| Opcode::Fmul
| Opcode::Fdiv => implemented_in_isle(ctx),
| Opcode::Fdiv
| Opcode::Fmin
| Opcode::Fmax
| Opcode::FminPseudo
| Opcode::FmaxPseudo => implemented_in_isle(ctx),
Opcode::Icmp => {
let condcode = ctx.data(insn).cond_code().unwrap();
@@ -1278,235 +1282,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
}
Opcode::Fmin | Opcode::Fmax => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let is_min = op == Opcode::Fmin;
let output_ty = ty.unwrap();
ctx.emit(Inst::gen_move(dst, rhs, output_ty));
if !output_ty.is_vector() {
let op_size = match output_ty {
types::F32 => OperandSize::Size32,
types::F64 => OperandSize::Size64,
_ => panic!("unexpected type {:?} for fmin/fmax", output_ty),
};
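// xmm_min_max_seq is a pseudo-instruction that emit.rs expands into a
// compare-and-branch sequence handling the NaN and +/-0 cases (see the
// comment in the vector branch below).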
ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst));
} else {
// x64's packed floating-point min and max do not propagate NaNs and +0's in a
// way that is friendly to the SIMD spec. The scalar lowering uses jumps to
// handle the cases where NaN and +0 propagation is not consistent with what is
// needed, but for packed min and max we take a different approach so that the
// jump sequence is not repeated once per lane. Because we do not need to lower
// labels and jumps, but do need ctx for creating temporaries, we implement the
// lowering here in lower.rs instead of emit.rs as is done for the scalar case.
// The outline of the approach is as follows:
//
// First we perform the min/max in both directions. We do this because, when a
// lane of either operand contains a NaN, or when the two operands' lanes hold
// zeros with mismatched signs, x64 returns the second operand regardless of its
// contents. Computing the operation in both directions and merging the results
// therefore guarantees that NaNs and sign-mismatched zeros are captured. We then
// normalize the merged result: a compare creates a mask of the lanes containing
// NaNs, and that mask is used to turn those lanes into quiet NaNs and to
// normalize the zeros.
// The following sequence is generated for min:
//
// movap{s,d} %lhs, %tmp
// minp{s,d} %dst, %tmp
// minp{s,d} %lhs, %dst
// orp{s,d} %dst, %tmp
// cmpp{s,d} %tmp, %dst, $3
// orp{s,d} %dst, %tmp
// psrl{d,q} {$10, $13}, %dst
// andnp{s,d} %tmp, %dst
//
// and for max the sequence is:
//
// movap{s,d} %lhs, %tmp
// maxp{s,d} %dst, %tmp
// maxp{s,d} %lhs, %dst
// xorp{s,d} %tmp, %dst
// orp{s,d} %dst, %tmp
// subp{s,d} %dst, %tmp
// cmpp{s,d} %tmp, %dst, $3
// psrl{d,q} {$10, $13}, %dst
// andnp{s,d} %tmp, %dst
if is_min {
let (mov_op, min_op, or_op, cmp_op, shift_op, shift_by, andn_op) =
match output_ty {
types::F32X4 => (
SseOpcode::Movaps,
SseOpcode::Minps,
SseOpcode::Orps,
SseOpcode::Cmpps,
SseOpcode::Psrld,
10,
SseOpcode::Andnps,
),
types::F64X2 => (
SseOpcode::Movapd,
SseOpcode::Minpd,
SseOpcode::Orpd,
SseOpcode::Cmppd,
SseOpcode::Psrlq,
13,
SseOpcode::Andnpd,
),
_ => unimplemented!("unsupported op type {:?}", output_ty),
};
// Copy lhs into tmp
let tmp_xmm1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
// Perform min in reverse direction
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
// Perform min in original direction
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
// x64 handles propagation of -0's and NaNs differently between the left and
// right operands. After doing the min in both directions, this OR guarantees
// that -0's and NaNs are captured in our tmp register.
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
// Compare unordered to create a mask of the lanes containing NaNs, then use
// that mask to saturate the NaN-containing lanes of the tmp register with 1s.
// TODO: Would a check for NaN and then a jump be better here in the
// common case than continuing on to normalize NaNs that might not exist?
let cond = FcmpImm::from(FloatCC::Unordered);
ctx.emit(Inst::xmm_rm_r_imm(
cmp_op,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
cond.encode(),
OperandSize::Size32,
));
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// The dst register now holds a mask of the lanes containing NaNs.
// We shift that mask in preparation for building a second mask that
// normalizes NaNs (produces a quiet NaN) by zeroing out the appropriate
// number of least significant bits. We shift each lane right by 10 bits
// (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
// 11 exp. + 1 MSB sig.) for F64X2.
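// Concretely, for an F32X4 NaN lane (illustrative values, not emitted code):
// the unordered compare leaves 0xFFFF_FFFF in dst and the OR above has
// saturated the same lane of tmp to 0xFFFF_FFFF; the shift turns the dst
// lane into 0x003F_FFFF, and the ANDN below computes
// !0x003F_FFFF & 0xFFFF_FFFF = 0xFFC0_0000, a quiet NaN.
// Non-NaN lanes have a zero mask, so the ANDN passes the merged min through
// unchanged.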
ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
// Finally we ANDN with the tmp register (dst = !dst & tmp) to produce the
// final result in dst.
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
} else {
let (
mov_op,
max_op,
xor_op,
or_op,
sub_op,
cmp_op,
shift_op,
shift_by,
andn_op,
) = match output_ty {
types::F32X4 => (
SseOpcode::Movaps,
SseOpcode::Maxps,
SseOpcode::Xorps,
SseOpcode::Orps,
SseOpcode::Subps,
SseOpcode::Cmpps,
SseOpcode::Psrld,
10,
SseOpcode::Andnps,
),
types::F64X2 => (
SseOpcode::Movapd,
SseOpcode::Maxpd,
SseOpcode::Xorpd,
SseOpcode::Orpd,
SseOpcode::Subpd,
SseOpcode::Cmppd,
SseOpcode::Psrlq,
13,
SseOpcode::Andnpd,
),
_ => unimplemented!("unsupported op type {:?}", output_ty),
};
// Copy lhs into tmp.
let tmp_xmm1 = ctx.alloc_tmp(types::F32).only_reg().unwrap();
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1));
// Perform max in reverse direction.
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// Perform max in original direction.
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
// XOR the two results into dst; lanes where the two directions disagree
// (NaNs or zeros with mismatched signs) become nonzero. Max uses a different
// approach than min to account for potential discrepancies with plus/minus 0.
ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
// x64 handles propagation of -0's and NaNs differently between the left and
// right operands. After doing the max in both directions, this OR guarantees
// that 0's and NaNs are captured in our tmp register.
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// Capture NaNs and sign discrepancies.
ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// Compare unordered to create a mask of the lanes containing NaNs, then use
// that mask to saturate the NaN-containing lanes of the tmp register with 1s.
let cond = FcmpImm::from(FloatCC::Unordered);
ctx.emit(Inst::xmm_rm_r_imm(
cmp_op,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
cond.encode(),
OperandSize::Size32,
));
// The dst register now holds a mask of the lanes containing NaNs.
// We shift that mask in preparation for building a second mask that
// normalizes NaNs (produces a quiet NaN) by zeroing out the appropriate
// number of least significant bits. We shift each lane right by 10 bits
// (1 sign + 8 exp. + 1 MSB sig.) for F32X4 and by 13 bits (1 sign +
// 11 exp. + 1 MSB sig.) for F64X2.
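// As in the min case, non-NaN lanes have a zero mask and pass through
// unchanged, while NaN lanes keep only their sign, exponent, and quiet bits
// after the ANDN, yielding a quiet NaN (the subtract above has already
// quieted any signaling NaN in tmp).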
ctx.emit(Inst::xmm_rmi_reg(shift_op, RegMemImm::imm(shift_by), dst));
// Finally we ANDN with the tmp register (dst = !dst & tmp) to produce the
// final result in dst.
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
}
}
}
Opcode::FminPseudo | Opcode::FmaxPseudo => {
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
let rhs = put_input_in_reg(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
ctx.emit(Inst::gen_move(dst, rhs, ty));
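// With RHS in dst, the hardware min/max computes op(dst, lhs) and, when
// either input is NaN or the inputs are zeros of opposite sign, returns its
// second source operand (lhs); that matches the relaxed pseudo-min/max
// semantics, so no NaN/zero normalization sequence is needed here.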
let sse_opcode = match (ty, op) {
(types::F32, Opcode::FminPseudo) => SseOpcode::Minss,
(types::F32, Opcode::FmaxPseudo) => SseOpcode::Maxss,
(types::F64, Opcode::FminPseudo) => SseOpcode::Minsd,
(types::F64, Opcode::FmaxPseudo) => SseOpcode::Maxsd,
(types::F32X4, Opcode::FminPseudo) => SseOpcode::Minps,
(types::F32X4, Opcode::FmaxPseudo) => SseOpcode::Maxps,
(types::F64X2, Opcode::FminPseudo) => SseOpcode::Minpd,
(types::F64X2, Opcode::FmaxPseudo) => SseOpcode::Maxpd,
_ => unimplemented!("unsupported type {} for {}", ty, op),
};
ctx.emit(Inst::xmm_rm_r(sse_opcode, lhs, dst));
}
Opcode::Sqrt => {
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
// must avoid merging a load here.