diff --git a/cranelift/codegen/src/opts/algebraic.isle b/cranelift/codegen/src/opts/algebraic.isle
index 8193f7945e..c25fb5a1c3 100644
--- a/cranelift/codegen/src/opts/algebraic.isle
+++ b/cranelift/codegen/src/opts/algebraic.isle
@@ -345,3 +345,131 @@
                      (uextend $I64 x @ (value_type $I32))
                      (iconst _ (u64_from_imm64 0))))
       (iconst ty (imm64 1)))
+
+
+;; Transform select-of-icmp into {u,s}{min,max} instructions where possible.
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedGreaterThan) x y) x y))
+      (smax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y))
+      (smax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y))
+      (umax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y))
+      (umax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedLessThan) x y) x y))
+      (smin ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y))
+      (smin ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedLessThan) x y) x y))
+      (umin ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y))
+      (umin ty x y))
+
+
+;; These are the same rules as above, but with the operands of the select swapped.
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedLessThan) x y) y x))
+      (smax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x))
+      (smax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedLessThan) x y) y x))
+      (umax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x))
+      (umax ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedGreaterThan) x y) y x))
+      (smin ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x))
+      (smin ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x))
+      (umin ty x y))
+(rule (simplify
+        (select ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x))
+      (umin ty x y))
+
+;; Transform vselect-of-icmp into {u,s}{min,max} instructions where possible.
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) x y))
+      (smax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y))
+      (smax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y))
+      (umax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y))
+      (umax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedLessThan) x y) x y))
+      (smin ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y))
+      (smin ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) x y))
+      (umin ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y))
+      (umin ty x y))
+
+;; These are the same rules as above, but with the operands of the vselect swapped.
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedLessThan) x y) y x))
+      (smax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x))
+      (smax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) y x))
+      (umax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x))
+      (umax ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) y x))
+      (smin ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x))
+      (smin ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x))
+      (umin ty x y))
+(rule (simplify
+        (vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x))
+      (umin ty x y))
+
+;; For floats, convert fcmp lt into fmin_pseudo and fcmp gt into fmax_pseudo.
+;;
+;; The fmax_pseudo docs state:
+;; The behaviour for this operation is defined as fmax_pseudo(a, b) = (a < b) ? b : a, and the behaviour for zero
+;; or NaN inputs follows from the behaviour of < with such inputs.
+;;
+;; That is exactly the operation that we match here!
+(rule (simplify
+        (select ty (fcmp _ (FloatCC.LessThan) x y) x y))
+      (fmin_pseudo ty x y))
+(rule (simplify
+        (select ty (fcmp _ (FloatCC.GreaterThan) x y) x y))
+      (fmax_pseudo ty x y))
+
+;; Do the same for vectors.
+(rule (simplify
+        (vselect ty (fcmp _ (FloatCC.LessThan) x y) x y))
+      (fmin_pseudo ty x y))
+(rule (simplify
+        (vselect ty (fcmp _ (FloatCC.GreaterThan) x y) x y))
+      (fmax_pseudo ty x y))
diff --git a/cranelift/filetests/filetests/egraph/licm.clif b/cranelift/filetests/filetests/egraph/licm.clif
index 7e13392009..7d44f53fe8 100644
--- a/cranelift/filetests/filetests/egraph/licm.clif
+++ b/cranelift/filetests/filetests/egraph/licm.clif
@@ -31,8 +31,8 @@ block2(v9: i32):
 
 ; check: v8 = iadd v2, v3
 ; check: brif v6, block2, block1(v8)
+
 ; check: block2:
 ; check: v10 = iconst.i32 1
 ; check: v4 = iadd.i32 v1, v10
 ; check: return v4
-
diff --git a/cranelift/filetests/filetests/egraph/select.clif b/cranelift/filetests/filetests/egraph/select.clif
new file mode 100644
index 0000000000..12096ce8f1
--- /dev/null
+++ b/cranelift/filetests/filetests/egraph/select.clif
@@ -0,0 +1,155 @@
+test optimize
+set opt_level=speed
+set use_egraphs=true
+target x86_64
+target aarch64
+target s390x
+target riscv64
+
+function %select_sgt_to_smax(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp sgt v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = smax v0, v1
+; check: return v4
+
+
+; This tests an inverted select, where the operands are swapped.
+function %select_sgt_to_smax_inverse(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp sgt v0, v1
+    v3 = select v2, v1, v0
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = smin v0, v1
+; check: return v4
+
+
+function %select_sge_to_smax(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp sge v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = smax v0, v1
+; check: return v4
+
+
+function %select_ugt_to_umax(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp ugt v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = umax v0, v1
+; check: return v4
+
+
+function %select_uge_to_umax(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp uge v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = umax v0, v1
+; check: return v4
+
+
+
+function %select_slt_to_smin(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp slt v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = smin v0, v1
+; check: return v4
+
+
+function %select_sle_to_smin(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp sle v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = smin v0, v1
+; check: return v4
+
+
+function %select_ult_to_umin(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp ult v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = umin v0, v1
+; check: return v4
+
+
+function %select_ule_to_umin(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = icmp ule v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32, v1: i32):
+; check: v4 = umin v0, v1
+; check: return v4
+
+
+
+function %select_with_different_regs_does_not_optimize(i32, i32, i32, i32) -> i32 {
+block0(v0: i32, v1: i32, v2: i32, v3: i32):
+    v4 = icmp ule v0, v1
+    v5 = select v4, v2, v3
+    return v5
+}
+
+; check: block0(v0: i32, v1: i32, v2: i32, v3: i32):
+; check: v4 = icmp ule v0, v1
+; check: v5 = select v4, v2, v3
+; check: return v5
+
+
+
+
+function %select_fcmp_gt_to_fmax_pseudo(f32, f32) -> f32 {
+block0(v0: f32, v1: f32):
+    v2 = fcmp gt v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: f32, v1: f32):
+; check: v4 = fmax_pseudo v0, v1
+; check: return v4
+
+function %select_fcmp_lt_to_fmin_pseudo(f32, f32) -> f32 {
+block0(v0: f32, v1: f32):
+    v2 = fcmp lt v0, v1
+    v3 = select v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: f32, v1: f32):
+; check: v4 = fmin_pseudo v0, v1
+; check: return v4
diff --git a/cranelift/filetests/filetests/egraph/vselect.clif b/cranelift/filetests/filetests/egraph/vselect.clif
new file mode 100644
index 0000000000..805f7b61cc
--- /dev/null
+++ b/cranelift/filetests/filetests/egraph/vselect.clif
@@ -0,0 +1,154 @@
+test optimize
+set opt_level=speed
+set use_egraphs=true
+target x86_64
+target aarch64
+target s390x
+
+function %vselect_sgt_to_smax(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp sgt v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = smax v0, v1
+; check: return v4
+
+
+; This tests an inverted vselect, where the operands are swapped.
+function %vselect_sgt_to_smax_inverse(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp sgt v0, v1
+    v3 = vselect v2, v1, v0
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = smin v0, v1
+; check: return v4
+
+
+
+function %vselect_sge_to_smax(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp sge v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = smax v0, v1
+; check: return v4
+
+
+function %vselect_ugt_to_umax(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp ugt v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = umax v0, v1
+; check: return v4
+
+
+function %vselect_uge_to_umax(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp uge v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = umax v0, v1
+; check: return v4
+
+
+
+function %vselect_slt_to_smin(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp slt v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = smin v0, v1
+; check: return v4
+
+
+function %vselect_sle_to_smin(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp sle v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = smin v0, v1
+; check: return v4
+
+
+function %vselect_ult_to_umin(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp ult v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = umin v0, v1
+; check: return v4
+
+
+function %vselect_ule_to_umin(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = icmp ule v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: i32x4, v1: i32x4):
+; check: v4 = umin v0, v1
+; check: return v4
+
+
+
+function %vselect_with_different_regs_does_not_optimize(i32x4, i32x4, i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4):
+    v4 = icmp ule v0, v1
+    v5 = vselect v4, v2, v3
+    return v5
+}
+
+; check: block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4):
+; check: v4 = icmp ule v0, v1
+; check: v5 = vselect v4, v2, v3
+; check: return v5
+
+
+
+function %vselect_fcmp_gt_to_fmax_pseudo(f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4):
+    v2 = fcmp gt v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: f32x4, v1: f32x4):
+; check: v4 = fmax_pseudo v0, v1
+; check: return v4
+
+function %vselect_fcmp_lt_to_fmin_pseudo(f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4):
+    v2 = fcmp lt v0, v1
+    v3 = vselect v2, v0, v1
+    return v3
+}
+
+; check: block0(v0: f32x4, v1: f32x4):
+; check: v4 = fmin_pseudo v0, v1
+; check: return v4