diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index afceebff31..71a549647c 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1596,8 +1596,6 @@ fn define_simd(
     let fdiv = shared.by_name("fdiv");
     let fill = shared.by_name("fill");
     let fill_nop = shared.by_name("fill_nop");
-    let fmax = shared.by_name("fmax");
-    let fmin = shared.by_name("fmin");
     let fmul = shared.by_name("fmul");
     let fsub = shared.by_name("fsub");
     let iadd = shared.by_name("iadd");
@@ -1637,6 +1635,8 @@ fn define_simd(
     let vselect = shared.by_name("vselect");
     let x86_cvtt2si = x86.by_name("x86_cvtt2si");
     let x86_insertps = x86.by_name("x86_insertps");
+    let x86_fmax = x86.by_name("x86_fmax");
+    let x86_fmin = x86.by_name("x86_fmin");
     let x86_movlhps = x86.by_name("x86_movlhps");
     let x86_movsd = x86.by_name("x86_movsd");
     let x86_packss = x86.by_name("x86_packss");
@@ -2276,10 +2276,10 @@ fn define_simd(
         (F64, fmul, &MULPD[..]),
         (F32, fdiv, &DIVPS[..]),
         (F64, fdiv, &DIVPD[..]),
-        (F32, fmin, &MINPS[..]),
-        (F64, fmin, &MINPD[..]),
-        (F32, fmax, &MAXPS[..]),
-        (F64, fmax, &MAXPD[..]),
+        (F32, x86_fmin, &MINPS[..]),
+        (F64, x86_fmin, &MINPD[..]),
+        (F32, x86_fmax, &MAXPS[..]),
+        (F64, x86_fmax, &MAXPD[..]),
     ] {
         let inst = inst.bind(vector(*ty, sse_vector_size));
        e.enc_both_inferred(inst, rec_fa.opcodes(opcodes));
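The hunks above only re-point the vector encodings: CLIF `fmin`/`fmax` no longer receive MINPS/MINPD/MAXPS/MAXPD directly; the x86-specific `x86_fmin`/`x86_fmax` instructions do. Those machine instructions by themselves do not implement WebAssembly's `fmin`/`fmax`. A minimal per-lane sketch in plain Rust (the helper names are illustrative, not Cranelift or ISA APIs) shows the two cases the legalization below has to repair:

// Minimal per-lane model of MINPS/MAXPS (illustrative; these helper names are
// made up for this note). Whenever the comparison is false -- every comparison
// involving a NaN, and +0.0 vs. -0.0, which compare equal -- the *second*
// operand is returned unchanged.
fn x86_minps_lane(a: f32, b: f32) -> f32 {
    if a < b { a } else { b }
}

fn x86_maxps_lane(a: f32, b: f32) -> f32 {
    if a > b { a } else { b }
}

fn main() {
    // Wasm fmin(NaN, 1.0) must be a NaN; MINPS simply returns the second operand.
    assert_eq!(x86_minps_lane(f32::NAN, 1.0), 1.0);
    // Wasm fmin(-0.0, +0.0) must be -0.0; MINPS again returns the second operand, +0.0.
    assert_eq!(x86_minps_lane(-0.0, 0.0).to_bits(), 0.0f32.to_bits());
    // Wasm fmax(+0.0, -0.0) must be +0.0; MAXPS returns -0.0 instead.
    assert_eq!(x86_maxps_lane(0.0, -0.0).to_bits(), (-0.0f32).to_bits());
}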
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 130205fee0..51453322e9 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -379,10 +379,12 @@ fn define_simd(
     let bnot = insts.by_name("bnot");
     let bxor = insts.by_name("bxor");
     let extractlane = insts.by_name("extractlane");
+    let fabs = insts.by_name("fabs");
     let fcmp = insts.by_name("fcmp");
     let fcvt_from_uint = insts.by_name("fcvt_from_uint");
     let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
-    let fabs = insts.by_name("fabs");
+    let fmax = insts.by_name("fmax");
+    let fmin = insts.by_name("fmin");
     let fneg = insts.by_name("fneg");
     let iadd_imm = insts.by_name("iadd_imm");
     let icmp = insts.by_name("icmp");
@@ -790,6 +792,8 @@ fn define_simd(
     narrow.custom_legalize(ushr, "convert_ushr");
     narrow.custom_legalize(ishl, "convert_ishl");
     narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
+    narrow.custom_legalize(fmin, "expand_minmax_vector");
+    narrow.custom_legalize(fmax, "expand_minmax_vector");

     narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
     narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 42e5064263..b64140cfa8 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -596,6 +596,100 @@ fn expand_minmax(
     cfg.recompute_block(pos.func, done);
 }

+/// This legalization converts a vector minimum/maximum operation into a sequence that matches
+/// WebAssembly's NaN-handling semantics, which x86 does not implement directly. This logic is
+/// kept separate from [expand_minmax] above (the scalar version) for code clarity.
+fn expand_minmax_vector(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let ty = func.dfg.ctrl_typevar(inst);
+    debug_assert!(ty.is_vector());
+    let (x, y, x86_opcode, is_max) = match func.dfg[inst] {
+        ir::InstructionData::Binary {
+            opcode: ir::Opcode::Fmin,
+            args,
+        } => (args[0], args[1], ir::Opcode::X86Fmin, false),
+        ir::InstructionData::Binary {
+            opcode: ir::Opcode::Fmax,
+            args,
+        } => (args[0], args[1], ir::Opcode::X86Fmax, true),
+        _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
+    };
+
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst);
+
+    // This sequence is complex due to how x86 handles NaNs and +0/-0. If either operand of a
+    // lane is a NaN, x86 returns the second operand for that lane; likewise, if both operands
+    // are in {+0.0, -0.0}, it returns the second operand. To match the WebAssembly behavior of
+    // "return the minimum (or maximum) of the operands, or a canonical NaN if either operand
+    // is NaN," we must compare in both directions.
+    let (forward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, x, y);
+    let forward = dfg.first_result(forward_inst);
+    let (backward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, y, x);
+    let backward = dfg.first_result(backward_inst);
+
+    let (value, mask) = if is_max {
+        // For maximum:
+        // Find any differences between the forward and backward `max` operations.
+        let difference = pos.ins().bxor(forward, backward);
+        // Merge in the differences.
+        let propagate_nans_and_plus_zero = pos.ins().bor(backward, difference);
+        let value = pos.ins().fsub(propagate_nans_and_plus_zero, difference);
+        // Discover which lanes have NaNs in them.
+        let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, difference, value);
+        (value, find_nan_lanes_mask)
+    } else {
+        // For minimum:
+        // If either lane is a NaN, we want to use these bits, not the second operand bits.
+        let propagate_nans = pos.ins().bor(backward, forward);
+        // Find which lanes contain a NaN with an unordered comparison, filling the mask with
+        // 1s.
+        let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, forward, propagate_nans);
+        let bitcast_find_nan_lanes_mask = pos.ins().raw_bitcast(ty, find_nan_lanes_mask);
+        // Then flood the value lane with all 1s if that lane is a NaN. This causes all NaNs
+        // along this code path to be quieted and negative: after the upcoming shift and
+        // and_not, all upper bits (sign, exponent, and the significand's most significant
+        // bit) will be 1s.
+        let tmp = pos.ins().bor(propagate_nans, bitcast_find_nan_lanes_mask);
+        (tmp, bitcast_find_nan_lanes_mask)
+    };
+
+    // During this lowering we will need to know how many bits to shift by and what type to
+    // convert to when using an integer shift. Recall that an IEEE 754 number looks like:
+    //   `[sign bit] [exponent bits] [significand bits]`
+    // A quiet NaN has all exponent bits set to 1 and the most significant bit of the
+    // significand set to 1; a signaling NaN has the same exponent but the MSB of the
+    // significand set to 0. The payload of the NaN is the remaining significand bits, and
+    // WebAssembly assumes a canonical NaN is quiet and has 0s in its payload. To compute this
+    // canonical NaN, we create a mask for the top 10 bits on F32X4 (1 sign + 8 exp. + 1 MSB
+    // sig.) and the top 13 bits on F64X2 (1 sign + 11 exp. + 1 MSB sig.). This means that all
+    // NaNs produced with the mask will be negative (`-NaN`), which is allowed by the sign
+    // non-determinism in the spec:
+    // https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0
+    let (shift_by, ty_as_int) = match ty {
+        F32X4 => (10, I32X4),
+        F64X2 => (13, I64X2),
+        _ => unimplemented!("this legalization only understands 128-bit floating point types"),
+    };
+
+    // In order to clear the NaN payload for canonical NaNs, we shift the NaN lanes (all 1s)
+    // right, leaving 0s in the top bits. Non-NaN lanes of the mask are all 0s, so the shift
+    // has no effect on them.
+    let mask_as_int = pos.ins().raw_bitcast(ty_as_int, mask);
+    let shift_mask = pos.ins().ushr_imm(mask_as_int, shift_by);
+    let shift_mask_as_float = pos.ins().raw_bitcast(ty, shift_mask);
+
+    // Finally, we replace the value with `value & ~shift_mask`. For non-NaN lanes this is
+    // equivalent to `... & 1111...`, but for NaN lanes it keeps only 1s in the top bits,
+    // clearing the payload.
+    pos.func
+        .dfg
+        .replace(inst)
+        .band_not(value, shift_mask_as_float);
+}
+
 /// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
 /// i64 with a pattern, the rest needs more code.
 ///
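As a reading aid for `expand_minmax_vector`, here is a scalar model of one f32 lane in plain Rust; it is an illustrative sketch, not code from this patch. `x86_min`/`x86_max` stand in for MINPS/MAXPS, the `u32` bit operations stand in for `bor`/`bxor`/`ushr_imm`/`band_not`, the shift amount 10 matches the F32X4 arm, and the asserts mirror the run tests at the end of this patch.

// Scalar model of expand_minmax_vector for a single f32 lane (illustrative only;
// the real code builds CLIF instructions over whole vectors).
fn x86_min(a: f32, b: f32) -> f32 {
    if a < b { a } else { b }
}
fn x86_max(a: f32, b: f32) -> f32 {
    if a > b { a } else { b }
}

fn wasm_fmin_lane(x: f32, y: f32) -> f32 {
    let forward = x86_min(x, y).to_bits();
    let backward = x86_min(y, x).to_bits();
    // bor: keeps NaN bits from either direction and prefers -0.0 over +0.0.
    let propagate_nans = backward | forward;
    // fcmp uno: an all-ones mask wherever a NaN is involved.
    let is_nan = f32::from_bits(forward).is_nan() || f32::from_bits(propagate_nans).is_nan();
    let mask: u32 = if is_nan { u32::MAX } else { 0 };
    // bor floods NaN lanes with 1s; band_not with (mask >> 10) clears the payload,
    // leaving a negative canonical NaN. Non-NaN lanes pass through untouched.
    f32::from_bits((propagate_nans | mask) & !(mask >> 10))
}

fn wasm_fmax_lane(x: f32, y: f32) -> f32 {
    let forward = x86_max(x, y).to_bits();
    let backward = x86_max(y, x).to_bits();
    // bxor: nonzero only where the two directions disagree (NaN or +/-0.0 cases).
    let difference = forward ^ backward;
    let propagate_nans_and_plus_zero = backward | difference;
    // fsub: turns the +/-0.0 disagreement into +0.0 and quiets any NaN.
    let value =
        (f32::from_bits(propagate_nans_and_plus_zero) - f32::from_bits(difference)).to_bits();
    let is_nan = f32::from_bits(difference).is_nan() || f32::from_bits(value).is_nan();
    let mask: u32 = if is_nan { u32::MAX } else { 0 };
    f32::from_bits(value & !(mask >> 10))
}

fn main() {
    // These mirror the f64x2 run tests at the end of this patch, scaled down to f32.
    assert_eq!(wasm_fmin_lane(-0.0, 0.0).to_bits(), (-0.0f32).to_bits());
    assert_eq!(wasm_fmin_lane(f32::NAN, 0.0).to_bits(), 0xFFC0_0000); // negative canonical NaN
    assert_eq!(wasm_fmax_lane(-0.0, 0.0).to_bits(), 0.0f32.to_bits());
    assert!(wasm_fmax_lane(f32::NAN, 1.0).is_nan()); // sign/payload of the input NaN may survive
}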
diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif
index f42ba11a96..6a215a8e6c 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif
@@ -58,8 +58,8 @@ block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm5]):
 [-, %xmm3] v3 = fsub v0, v1 ; bin: 0f 5c dd
 [-, %xmm3] v4 = fmul v0, v1 ; bin: 0f 59 dd
 [-, %xmm3] v5 = fdiv v0, v1 ; bin: 0f 5e dd
-[-, %xmm3] v6 = fmin v0, v1 ; bin: 0f 5d dd
-[-, %xmm3] v7 = fmax v0, v1 ; bin: 0f 5f dd
+[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 0f 5d dd
+[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 0f 5f dd
 [-, %xmm3] v8 = sqrt v0 ; bin: 0f 51 db
     return
 }
@@ -70,8 +70,8 @@ block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm10]):
 [-, %xmm3] v3 = fsub v0, v1 ; bin: 41 0f 5c da
 [-, %xmm3] v4 = fmul v0, v1 ; bin: 41 0f 59 da
 [-, %xmm3] v5 = fdiv v0, v1 ; bin: 41 0f 5e da
-[-, %xmm3] v6 = fmin v0, v1 ; bin: 41 0f 5d da
-[-, %xmm3] v7 = fmax v0, v1 ; bin: 41 0f 5f da
+[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 41 0f 5d da
+[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 41 0f 5f da
 [-, %xmm3] v8 = sqrt v1 ; bin: 41 0f 51 da
     return
 }
@@ -82,8 +82,8 @@ block0(v0: f64x2 [%xmm3], v1: f64x2 [%xmm5]):
 [-, %xmm3] v3 = fsub v0, v1 ; bin: 66 0f 5c dd
 [-, %xmm3] v4 = fmul v0, v1 ; bin: 66 0f 59 dd
 [-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 0f 5e dd
-[-, %xmm3] v6 = fmin v0, v1 ; bin: 66 0f 5d dd
-[-, %xmm3] v7 = fmax v0, v1 ; bin: 66 0f 5f dd
+[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 66 0f 5d dd
+[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 66 0f 5f dd
 [-, %xmm3] v8 = sqrt v0 ; bin: 66 0f 51 db
     return
 }
@@ -94,8 +94,8 @@ block0(v0: f64x2 [%xmm11], v1: f64x2 [%xmm13]):
 [-, %xmm11] v3 = fsub v0, v1 ; bin: 66 45 0f 5c dd
 [-, %xmm11] v4 = fmul v0, v1 ; bin: 66 45 0f 59 dd
 [-, %xmm11] v5 = fdiv v0, v1 ; bin: 66 45 0f 5e dd
-[-, %xmm11] v6 = fmin v0, v1 ; bin: 66 45 0f 5d dd
-[-, %xmm11] v7 = fmax v0, v1 ; bin: 66 45 0f 5f dd
+[-, %xmm11] v6 = x86_fmin v0, v1 ; bin: 66 45 0f 5d dd
+[-, %xmm11] v7 = x86_fmax v0, v1 ; bin: 66 45 0f 5f dd
 [-, %xmm11] v8 = sqrt v0 ; bin: 66 45 0f 51 db
     return
 }
diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif
index 39814b37bb..976ea2d02b 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif
@@ -83,3 +83,35 @@ block0(v0:i64x2, v1:i64x2):
     ; nextln: v2 = iadd v9, v8
     return
 }
+
+function %fmin_f32x4(f32x4, f32x4) {
+block0(v0:f32x4, v1:f32x4):
+    v2 = fmin v0, v1
+    ; check: v3 = x86_fmin v0, v1
+    ; nextln: v4 = x86_fmin v1, v0
+    ; nextln: v5 = bor v4, v3
+    ; nextln: v6 = fcmp uno v3, v5
+    ; nextln: v7 = raw_bitcast.f32x4 v6
+    ; nextln: v8 = bor v5, v7
+    ; nextln: v9 = raw_bitcast.i32x4 v7
+    ; nextln: v10 = ushr_imm v9, 10
+    ; nextln: v11 = raw_bitcast.f32x4 v10
+    ; nextln: v2 = band_not v8, v11
+    return
+}
+
+function %fmax_f64x2(f64x2, f64x2) {
+block0(v0:f64x2, v1:f64x2):
+    v2 = fmax v0, v1
+    ; check: v3 = x86_fmax v0, v1
+    ; nextln: v4 = x86_fmax v1, v0
+    ; nextln: v5 = bxor v3, v4
+    ; nextln: v6 = bor v4, v5
+    ; nextln: v7 = fsub v6, v5
+    ; nextln: v8 = fcmp uno v5, v7
+    ; nextln: v9 = raw_bitcast.i64x2 v8
+    ; nextln: v10 = ushr_imm v9, 13
+    ; nextln: v11 = raw_bitcast.f64x2 v10
+    ; nextln: v2 = band_not v7, v11
+    return
+}
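The `ushr_imm` constants 10 and 13 in the expected output above come directly from the IEEE 754 layouts: 1 sign bit + 8 (f32) or 11 (f64) exponent bits + 1 quiet bit. A small sanity check in plain Rust (an illustrative aside, not part of this diff) shows that and-notting away the shifted all-ones mask leaves exactly a negative canonical NaN:

// Check the canonical-NaN masks implied by the shift amounts 10 (f32) and 13 (f64).
fn main() {
    // f32: 1 sign + 8 exponent + 1 quiet bit = 10 upper bits survive band_not.
    assert_eq!(!(u32::MAX >> 10), 0xFFC0_0000);
    assert!(f32::from_bits(0xFFC0_0000).is_nan());

    // f64: 1 sign + 11 exponent + 1 quiet bit = 13 upper bits survive band_not.
    assert_eq!(!(u64::MAX >> 13), 0xFFF8_0000_0000_0000);
    assert!(f64::from_bits(0xFFF8_0000_0000_0000).is_nan());
}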
diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif
index 103a078547..601defecfd 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif
@@ -192,31 +192,31 @@ block0:
 }
 ; run

-function %fmax_f64x2() -> b1 {
-block0:
-    v0 = vconst.f64x2 [-0.0 -0x1.0]
-    v1 = vconst.f64x2 [+0.0 +0x1.0]
-
+function %fmax_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2):
     v2 = fmax v0, v1
-    v3 = fcmp eq v2, v1
-    v4 = vall_true v3
-
-    return v4
+    return v2
 }
-; run
-
-function %fmin_f64x2() -> b1 {
-block0:
-    v0 = vconst.f64x2 [-0x1.0 -0x1.0]
-    v1 = vconst.f64x2 [+0.0 +0x1.0]
+; note below how NaNs are quieted but (unlike fmin) retain their sign: this discrepancy is allowed by non-determinism
+; in the spec, see https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
+; run: %fmax_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [+0x0.0 0x1.0]
+; run: %fmax_f64x2([-NaN NaN], [0x0.0 0x100.0]) == [-NaN NaN]
+; run: %fmax_f64x2([NaN 0.0], [0.0 0.0]) == [NaN 0.0]
+; run: %fmax_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
+; run: %fmax_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [NaN 0.0]

+function %fmin_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2):
     v2 = fmin v0, v1
-    v3 = fcmp eq v2, v0
-    v4 = vall_true v3
-
-    return v4
+    return v2
 }
-; run
+; note below how NaNs are quieted and negative: this is due to non-determinism in the spec for NaNs, see
+; https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
+; run: %fmin_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [-0x0.0 -0x1.0]
+; run: %fmin_f64x2([-NaN 0x100.0], [0.0 NaN]) == [-NaN -NaN]
+; run: %fmin_f64x2([NaN 0.0], [0.0 0.0]) == [-NaN 0.0]
+; run: %fmin_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
+; run: %fmin_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [-NaN 0.0]

 function %fneg_f64x2() -> b1 {
 block0: