Provide spec-compliant legalization for SIMD floating point min/max
This commit is contained in:
@@ -1596,8 +1596,6 @@ fn define_simd(
|
|||||||
let fdiv = shared.by_name("fdiv");
|
let fdiv = shared.by_name("fdiv");
|
||||||
let fill = shared.by_name("fill");
|
let fill = shared.by_name("fill");
|
||||||
let fill_nop = shared.by_name("fill_nop");
|
let fill_nop = shared.by_name("fill_nop");
|
||||||
let fmax = shared.by_name("fmax");
|
|
||||||
let fmin = shared.by_name("fmin");
|
|
||||||
let fmul = shared.by_name("fmul");
|
let fmul = shared.by_name("fmul");
|
||||||
let fsub = shared.by_name("fsub");
|
let fsub = shared.by_name("fsub");
|
||||||
let iadd = shared.by_name("iadd");
|
let iadd = shared.by_name("iadd");
|
||||||
@@ -1637,6 +1635,8 @@ fn define_simd(
|
|||||||
let vselect = shared.by_name("vselect");
|
let vselect = shared.by_name("vselect");
|
||||||
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
|
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
|
||||||
let x86_insertps = x86.by_name("x86_insertps");
|
let x86_insertps = x86.by_name("x86_insertps");
|
||||||
|
let x86_fmax = x86.by_name("x86_fmax");
|
||||||
|
let x86_fmin = x86.by_name("x86_fmin");
|
||||||
let x86_movlhps = x86.by_name("x86_movlhps");
|
let x86_movlhps = x86.by_name("x86_movlhps");
|
||||||
let x86_movsd = x86.by_name("x86_movsd");
|
let x86_movsd = x86.by_name("x86_movsd");
|
||||||
let x86_packss = x86.by_name("x86_packss");
|
let x86_packss = x86.by_name("x86_packss");
|
||||||
@@ -2276,10 +2276,10 @@ fn define_simd(
|
|||||||
(F64, fmul, &MULPD[..]),
|
(F64, fmul, &MULPD[..]),
|
||||||
(F32, fdiv, &DIVPS[..]),
|
(F32, fdiv, &DIVPS[..]),
|
||||||
(F64, fdiv, &DIVPD[..]),
|
(F64, fdiv, &DIVPD[..]),
|
||||||
(F32, fmin, &MINPS[..]),
|
(F32, x86_fmin, &MINPS[..]),
|
||||||
(F64, fmin, &MINPD[..]),
|
(F64, x86_fmin, &MINPD[..]),
|
||||||
(F32, fmax, &MAXPS[..]),
|
(F32, x86_fmax, &MAXPS[..]),
|
||||||
(F64, fmax, &MAXPD[..]),
|
(F64, x86_fmax, &MAXPD[..]),
|
||||||
] {
|
] {
|
||||||
let inst = inst.bind(vector(*ty, sse_vector_size));
|
let inst = inst.bind(vector(*ty, sse_vector_size));
|
||||||
e.enc_both_inferred(inst, rec_fa.opcodes(opcodes));
|
e.enc_both_inferred(inst, rec_fa.opcodes(opcodes));
|
||||||
|
|||||||
@@ -379,10 +379,12 @@ fn define_simd(
|
|||||||
let bnot = insts.by_name("bnot");
|
let bnot = insts.by_name("bnot");
|
||||||
let bxor = insts.by_name("bxor");
|
let bxor = insts.by_name("bxor");
|
||||||
let extractlane = insts.by_name("extractlane");
|
let extractlane = insts.by_name("extractlane");
|
||||||
|
let fabs = insts.by_name("fabs");
|
||||||
let fcmp = insts.by_name("fcmp");
|
let fcmp = insts.by_name("fcmp");
|
||||||
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
|
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
|
||||||
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
|
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
|
||||||
let fabs = insts.by_name("fabs");
|
let fmax = insts.by_name("fmax");
|
||||||
|
let fmin = insts.by_name("fmin");
|
||||||
let fneg = insts.by_name("fneg");
|
let fneg = insts.by_name("fneg");
|
||||||
let iadd_imm = insts.by_name("iadd_imm");
|
let iadd_imm = insts.by_name("iadd_imm");
|
||||||
let icmp = insts.by_name("icmp");
|
let icmp = insts.by_name("icmp");
|
||||||
@@ -790,6 +792,8 @@ fn define_simd(
|
|||||||
narrow.custom_legalize(ushr, "convert_ushr");
|
narrow.custom_legalize(ushr, "convert_ushr");
|
||||||
narrow.custom_legalize(ishl, "convert_ishl");
|
narrow.custom_legalize(ishl, "convert_ishl");
|
||||||
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
|
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
|
||||||
|
narrow.custom_legalize(fmin, "expand_minmax_vector");
|
||||||
|
narrow.custom_legalize(fmax, "expand_minmax_vector");
|
||||||
|
|
||||||
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
||||||
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
||||||
|
|||||||
@@ -596,6 +596,100 @@ fn expand_minmax(
|
|||||||
cfg.recompute_block(pos.func, done);
|
cfg.recompute_block(pos.func, done);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This legalization converts a minimum/maximum operation into a sequence that matches the
|
||||||
|
/// non-x86-friendly WebAssembly semantics of NaN handling. This logic is kept separate from
|
||||||
|
/// [expand_minmax] above (the scalar version) for code clarity.
|
||||||
|
fn expand_minmax_vector(
|
||||||
|
inst: ir::Inst,
|
||||||
|
func: &mut ir::Function,
|
||||||
|
_cfg: &mut ControlFlowGraph,
|
||||||
|
_isa: &dyn TargetIsa,
|
||||||
|
) {
|
||||||
|
let ty = func.dfg.ctrl_typevar(inst);
|
||||||
|
debug_assert!(ty.is_vector());
|
||||||
|
let (x, y, x86_opcode, is_max) = match func.dfg[inst] {
|
||||||
|
ir::InstructionData::Binary {
|
||||||
|
opcode: ir::Opcode::Fmin,
|
||||||
|
args,
|
||||||
|
} => (args[0], args[1], ir::Opcode::X86Fmin, false),
|
||||||
|
ir::InstructionData::Binary {
|
||||||
|
opcode: ir::Opcode::Fmax,
|
||||||
|
args,
|
||||||
|
} => (args[0], args[1], ir::Opcode::X86Fmax, true),
|
||||||
|
_ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||||
|
pos.use_srcloc(inst);
|
||||||
|
|
||||||
|
// This sequence is complex due to how x86 handles NaNs and +0/-0. If x86 finds a NaN in
|
||||||
|
// either lane it returns the second operand; likewise, if both operands are in {+0.0, -0.0}
|
||||||
|
// it returns the second operand. To match the behavior of "return the minimum of the
|
||||||
|
// operands or a canonical NaN if either operand is NaN," we must compare in both
|
||||||
|
// directions.
|
||||||
|
let (forward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, x, y);
|
||||||
|
let forward = dfg.first_result(forward_inst);
|
||||||
|
let (backward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, y, x);
|
||||||
|
let backward = dfg.first_result(backward_inst);
|
||||||
|
|
||||||
|
let (value, mask) = if is_max {
|
||||||
|
// For maximum:
|
||||||
|
// Find any differences between the forward and backward `max` operation.
|
||||||
|
let difference = pos.ins().bxor(forward, backward);
|
||||||
|
// Merge in the differences.
|
||||||
|
let propagate_nans_and_plus_zero = pos.ins().bor(backward, difference);
|
||||||
|
let value = pos.ins().fsub(propagate_nans_and_plus_zero, difference);
|
||||||
|
// Discover which lanes have NaNs in them.
|
||||||
|
let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, difference, value);
|
||||||
|
(value, find_nan_lanes_mask)
|
||||||
|
} else {
|
||||||
|
// For minimum:
|
||||||
|
// If either lane is a NaN, we want to use these bits, not the second operand bits.
|
||||||
|
let propagate_nans = pos.ins().bor(backward, forward);
|
||||||
|
// Find which lanes contain a NaN with an unordered comparison, filling the mask with
|
||||||
|
// 1s.
|
||||||
|
let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, forward, propagate_nans);
|
||||||
|
let bitcast_find_nan_lanes_mask = pos.ins().raw_bitcast(ty, find_nan_lanes_mask);
|
||||||
|
// Then flood the value lane with all 1s if that lane is a NaN. This causes all NaNs
|
||||||
|
// along this code path to be quieted and negative: after the upcoming shift and and_not,
|
||||||
|
// all upper bits (sign, exponent, and payload MSB) will be 1s.
|
||||||
|
let tmp = pos.ins().bor(propagate_nans, bitcast_find_nan_lanes_mask);
|
||||||
|
(tmp, bitcast_find_nan_lanes_mask)
|
||||||
|
};
|
||||||
|
|
||||||
|
// During this lowering we will need to know how many bits to shift by and what type to
|
||||||
|
// convert to when using an integer shift. Recall that an IEEE754 number looks like:
|
||||||
|
// `[sign bit] [exponent bits] [significand bits]`
|
||||||
|
// A quiet NaN has all exponent bits set to 1 and the most significant bit of the
|
||||||
|
// significand set to 1; a signaling NaN has the same exponent but the MSB of the
|
||||||
|
// significand is set to 0. The payload of the NaN is the remaining significand bits, and
|
||||||
|
// WebAssembly assumes a canonical NaN is quiet and has 0s in its payload. To compute this
|
||||||
|
// canonical NaN, we create a mask for the top 10 bits on F32X4 (1 sign + 8 exp. + 1 MSB
|
||||||
|
// sig.) and the top 13 bits on F64X2 (1 sign + 11 exp. + 1 MSB sig.). This means that all
|
||||||
|
// NaNs produced with the mask will be negative (`-NaN`) which is allowed by the sign
|
||||||
|
// non-determinism in the spec: https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0
|
||||||
|
let (shift_by, ty_as_int) = match ty {
|
||||||
|
F32X4 => (10, I32X4),
|
||||||
|
F64X2 => (13, I64X2),
|
||||||
|
_ => unimplemented!("this legalization only understands 128-bit floating point types"),
|
||||||
|
};
|
||||||
|
|
||||||
|
// In order to clear the NaN payload for canonical NaNs, we shift right the NaN lanes (all
|
||||||
|
// 1s) leaving 0s in the top bits. Remember that non-NaN lanes are all 0s so this has
|
||||||
|
// little effect.
|
||||||
|
let mask_as_int = pos.ins().raw_bitcast(ty_as_int, mask);
|
||||||
|
let shift_mask = pos.ins().ushr_imm(mask_as_int, shift_by);
|
||||||
|
let shift_mask_as_float = pos.ins().raw_bitcast(ty, shift_mask);
|
||||||
|
|
||||||
|
// Finally, we replace the value with `value & ~shift_mask`. For non-NaN lanes, this is
|
||||||
|
// equivalent to `... & 1111...` but for NaN lanes this will only have 1s in the top bits,
|
||||||
|
// clearing the payload.
|
||||||
|
pos.func
|
||||||
|
.dfg
|
||||||
|
.replace(inst)
|
||||||
|
.band_not(value, shift_mask_as_float);
|
||||||
|
}
|
||||||
|
|
||||||
/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
|
/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
|
||||||
/// i64 with a pattern, the rest needs more code.
|
/// i64 with a pattern, the rest needs more code.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -58,8 +58,8 @@ block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm5]):
|
|||||||
[-, %xmm3] v3 = fsub v0, v1 ; bin: 0f 5c dd
|
[-, %xmm3] v3 = fsub v0, v1 ; bin: 0f 5c dd
|
||||||
[-, %xmm3] v4 = fmul v0, v1 ; bin: 0f 59 dd
|
[-, %xmm3] v4 = fmul v0, v1 ; bin: 0f 59 dd
|
||||||
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 0f 5e dd
|
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 0f 5e dd
|
||||||
[-, %xmm3] v6 = fmin v0, v1 ; bin: 0f 5d dd
|
[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 0f 5d dd
|
||||||
[-, %xmm3] v7 = fmax v0, v1 ; bin: 0f 5f dd
|
[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 0f 5f dd
|
||||||
[-, %xmm3] v8 = sqrt v0 ; bin: 0f 51 db
|
[-, %xmm3] v8 = sqrt v0 ; bin: 0f 51 db
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -70,8 +70,8 @@ block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm10]):
|
|||||||
[-, %xmm3] v3 = fsub v0, v1 ; bin: 41 0f 5c da
|
[-, %xmm3] v3 = fsub v0, v1 ; bin: 41 0f 5c da
|
||||||
[-, %xmm3] v4 = fmul v0, v1 ; bin: 41 0f 59 da
|
[-, %xmm3] v4 = fmul v0, v1 ; bin: 41 0f 59 da
|
||||||
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 41 0f 5e da
|
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 41 0f 5e da
|
||||||
[-, %xmm3] v6 = fmin v0, v1 ; bin: 41 0f 5d da
|
[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 41 0f 5d da
|
||||||
[-, %xmm3] v7 = fmax v0, v1 ; bin: 41 0f 5f da
|
[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 41 0f 5f da
|
||||||
[-, %xmm3] v8 = sqrt v1 ; bin: 41 0f 51 da
|
[-, %xmm3] v8 = sqrt v1 ; bin: 41 0f 51 da
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -82,8 +82,8 @@ block0(v0: f64x2 [%xmm3], v1: f64x2 [%xmm5]):
|
|||||||
[-, %xmm3] v3 = fsub v0, v1 ; bin: 66 0f 5c dd
|
[-, %xmm3] v3 = fsub v0, v1 ; bin: 66 0f 5c dd
|
||||||
[-, %xmm3] v4 = fmul v0, v1 ; bin: 66 0f 59 dd
|
[-, %xmm3] v4 = fmul v0, v1 ; bin: 66 0f 59 dd
|
||||||
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 0f 5e dd
|
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 0f 5e dd
|
||||||
[-, %xmm3] v6 = fmin v0, v1 ; bin: 66 0f 5d dd
|
[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 66 0f 5d dd
|
||||||
[-, %xmm3] v7 = fmax v0, v1 ; bin: 66 0f 5f dd
|
[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 66 0f 5f dd
|
||||||
[-, %xmm3] v8 = sqrt v0 ; bin: 66 0f 51 db
|
[-, %xmm3] v8 = sqrt v0 ; bin: 66 0f 51 db
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -94,8 +94,8 @@ block0(v0: f64x2 [%xmm11], v1: f64x2 [%xmm13]):
|
|||||||
[-, %xmm11] v3 = fsub v0, v1 ; bin: 66 45 0f 5c dd
|
[-, %xmm11] v3 = fsub v0, v1 ; bin: 66 45 0f 5c dd
|
||||||
[-, %xmm11] v4 = fmul v0, v1 ; bin: 66 45 0f 59 dd
|
[-, %xmm11] v4 = fmul v0, v1 ; bin: 66 45 0f 59 dd
|
||||||
[-, %xmm11] v5 = fdiv v0, v1 ; bin: 66 45 0f 5e dd
|
[-, %xmm11] v5 = fdiv v0, v1 ; bin: 66 45 0f 5e dd
|
||||||
[-, %xmm11] v6 = fmin v0, v1 ; bin: 66 45 0f 5d dd
|
[-, %xmm11] v6 = x86_fmin v0, v1 ; bin: 66 45 0f 5d dd
|
||||||
[-, %xmm11] v7 = fmax v0, v1 ; bin: 66 45 0f 5f dd
|
[-, %xmm11] v7 = x86_fmax v0, v1 ; bin: 66 45 0f 5f dd
|
||||||
[-, %xmm11] v8 = sqrt v0 ; bin: 66 45 0f 51 db
|
[-, %xmm11] v8 = sqrt v0 ; bin: 66 45 0f 51 db
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -83,3 +83,35 @@ block0(v0:i64x2, v1:i64x2):
|
|||||||
; nextln: v2 = iadd v9, v8
|
; nextln: v2 = iadd v9, v8
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function %fmin_f32x4(f32x4, f32x4) {
|
||||||
|
block0(v0:f32x4, v1:f32x4):
|
||||||
|
v2 = fmin v0, v1
|
||||||
|
; check: v3 = x86_fmin v0, v1
|
||||||
|
; nextln: v4 = x86_fmin v1, v0
|
||||||
|
; nextln: v5 = bor v4, v3
|
||||||
|
; nextln: v6 = fcmp uno v3, v5
|
||||||
|
; nextln: v7 = raw_bitcast.f32x4 v6
|
||||||
|
; nextln: v8 = bor v5, v7
|
||||||
|
; nextln: v9 = raw_bitcast.i32x4 v7
|
||||||
|
; nextln: v10 = ushr_imm v9, 10
|
||||||
|
; nextln: v11 = raw_bitcast.f32x4 v10
|
||||||
|
; nextln: v2 = band_not v8, v11
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
function %fmax_f64x2(f64x2, f64x2) {
|
||||||
|
block0(v0:f64x2, v1:f64x2):
|
||||||
|
v2 = fmax v0, v1
|
||||||
|
; check: v3 = x86_fmax v0, v1
|
||||||
|
; nextln: v4 = x86_fmax v1, v0
|
||||||
|
; nextln: v5 = bxor v3, v4
|
||||||
|
; nextln: v6 = bor v4, v5
|
||||||
|
; nextln: v7 = fsub v6, v5
|
||||||
|
; nextln: v8 = fcmp uno v5, v7
|
||||||
|
; nextln: v9 = raw_bitcast.i64x2 v8
|
||||||
|
; nextln: v10 = ushr_imm v9, 13
|
||||||
|
; nextln: v11 = raw_bitcast.f64x2 v10
|
||||||
|
; nextln: v2 = band_not v7, v11
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|||||||
@@ -192,31 +192,31 @@ block0:
|
|||||||
}
|
}
|
||||||
; run
|
; run
|
||||||
|
|
||||||
function %fmax_f64x2() -> b1 {
|
function %fmax_f64x2(f64x2, f64x2) -> f64x2 {
|
||||||
block0:
|
block0(v0: f64x2, v1: f64x2):
|
||||||
v0 = vconst.f64x2 [-0.0 -0x1.0]
|
|
||||||
v1 = vconst.f64x2 [+0.0 +0x1.0]
|
|
||||||
|
|
||||||
v2 = fmax v0, v1
|
v2 = fmax v0, v1
|
||||||
v3 = fcmp eq v2, v1
|
return v2
|
||||||
v4 = vall_true v3
|
|
||||||
|
|
||||||
return v4
|
|
||||||
}
|
}
|
||||||
; run
|
; note below how NaNs are quieted but (unlike fmin), retain their sign: this discrepancy is allowed by non-determinism
|
||||||
|
; in the spec, see https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
|
||||||
function %fmin_f64x2() -> b1 {
|
; run: %fmax_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [+0x0.0 0x1.0]
|
||||||
block0:
|
; run: %fmax_f64x2([-NaN NaN], [0x0.0 0x100.0]) == [-NaN NaN]
|
||||||
v0 = vconst.f64x2 [-0x1.0 -0x1.0]
|
; run: %fmax_f64x2([NaN 0.0], [0.0 0.0]) == [NaN 0.0]
|
||||||
v1 = vconst.f64x2 [+0.0 +0x1.0]
|
; run: %fmax_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
|
||||||
|
; run: %fmax_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [NaN 0.0]
|
||||||
|
|
||||||
|
function %fmin_f64x2(f64x2, f64x2) -> f64x2 {
|
||||||
|
block0(v0: f64x2, v1: f64x2):
|
||||||
v2 = fmin v0, v1
|
v2 = fmin v0, v1
|
||||||
v3 = fcmp eq v2, v0
|
return v2
|
||||||
v4 = vall_true v3
|
|
||||||
|
|
||||||
return v4
|
|
||||||
}
|
}
|
||||||
; run
|
; note below how NaNs are quieted and negative: this is due to non-determinism in the spec for NaNs, see
|
||||||
|
; https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
|
||||||
|
; run: %fmin_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [-0x0.0 -0x1.0]
|
||||||
|
; run: %fmin_f64x2([-NaN 0x100.0], [0.0 NaN]) == [-NaN -NaN]
|
||||||
|
; run: %fmin_f64x2([NaN 0.0], [0.0 0.0]) == [-NaN 0.0]
|
||||||
|
; run: %fmin_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
|
||||||
|
; run: %fmin_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [-NaN 0.0]
|
||||||
|
|
||||||
function %fneg_f64x2() -> b1 {
|
function %fneg_f64x2() -> b1 {
|
||||||
block0:
|
block0:
|
||||||
|
|||||||
Reference in New Issue
Block a user