diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index a49b998914..6aa4f5509f 100755 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -1409,8 +1409,8 @@ pub(crate) fn define( r#" Conditional select. - This instruction selects whole values. Use `vselect` for - lane-wise selection. + This instruction selects whole values. Use `bitselect` to choose each + bit according to a mask. "#, &formats.ternary, ) @@ -1458,7 +1458,7 @@ pub(crate) fn define( For each bit in `c`, this instruction selects the corresponding bit from `x` if the bit in `x` is 1 and the corresponding bit from `y` if the bit in `c` is 0. See also: - `select`, `vselect`. + `select`. "#, &formats.ternary, ) @@ -1484,26 +1484,7 @@ pub(crate) fn define( .operands_out(vec![a]), ); - let c = &Operand::new("c", &TxN.as_bool()).with_doc("Controlling vector"); - let x = &Operand::new("x", TxN).with_doc("Value to use where `c` is true"); - let y = &Operand::new("y", TxN).with_doc("Value to use where `c` is false"); let a = &Operand::new("a", TxN); - - ig.push( - Inst::new( - "vselect", - r#" - Vector lane select. - - Select lanes from ``x`` or ``y`` controlled by the lanes of the truthy - vector ``c``. - "#, - &formats.ternary, - ) - .operands_in(vec![c, x, y]) - .operands_out(vec![a]), - ); - let s = &Operand::new("s", i8); ig.push( diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index bd7e968d72..edb1124473 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -1659,11 +1659,6 @@ (rule 1 (lower (has_type (ty_vec128 ty) (bitselect c x y))) (bsl ty c x y)) -;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type (ty_vec128 ty) (vselect c x y))) - (bsl ty c x y)) - ;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; T -> I{64,32,16,8}: We can simply pass through the value: values diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index acbbf2082c..02563f4d5b 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -1170,13 +1170,6 @@ (vec_select ty y z x)) -;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Vector select. -(rule (lower (has_type (ty_vec128 ty) (vselect x y z))) - (vec_select ty y z x)) - - ;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type ty (bmask x))) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index f6abbec48d..ac27aae4d6 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1193,7 +1193,7 @@ (sse_or ty b a))) ;; If every byte of the condition is guaranteed to be all ones or all zeroes, -;; we can use x86_blend like vselect does. +;; we can use x64_blend. (rule 1 (lower (has_type ty @ (multi_lane _bits _lanes) (bitselect condition if_true @@ -1226,15 +1226,6 @@ (x86_blendv condition if_true if_false))) (x64_blendvpd if_false if_true condition)) -;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type ty @ (multi_lane _bits _lanes) - (vselect condition if_true if_false))) - (x64_blend ty - condition - if_true - if_false)) - ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx))) diff --git a/cranelift/codegen/src/nan_canonicalization.rs b/cranelift/codegen/src/nan_canonicalization.rs index 40600fc6fb..49415a86b0 100644 --- a/cranelift/codegen/src/nan_canonicalization.rs +++ b/cranelift/codegen/src/nan_canonicalization.rs @@ -7,6 +7,7 @@ use crate::ir::condcodes::FloatCC; use crate::ir::immediates::{Ieee32, Ieee64}; use crate::ir::types; use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value}; +use crate::opts::MemFlags; use crate::timing; // Canonical 32-bit and 64-bit NaN values. @@ -70,9 +71,10 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) { .select(is_nan, canon_nan, new_res); }; let vector_select = |pos: &mut FuncCursor, canon_nan: Value| { + let is_nan = pos.ins().bitcast(val_type, MemFlags::new(), is_nan); pos.ins() .with_result(val) - .vselect(is_nan, canon_nan, new_res); + .bitselect(is_nan, canon_nan, new_res); }; match val_type { diff --git a/cranelift/codegen/src/opts/algebraic.isle b/cranelift/codegen/src/opts/algebraic.isle index eac1650654..d7663df031 100644 --- a/cranelift/codegen/src/opts/algebraic.isle +++ b/cranelift/codegen/src/opts/algebraic.isle @@ -454,56 +454,56 @@ (select ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x)) (umin ty x y)) -;; Transform vselect-of-icmp into {u,s}{min,max} instructions where possible. +;; Transform bitselect-of-icmp into {u,s}{min,max} instructions where possible. (rule (simplify - (vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) x y)) + (bitselect ty (icmp _ (IntCC.SignedGreaterThan) x y) x y)) (smax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y)) + (bitselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) x y)) (smax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y)) + (bitselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) x y)) (umax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y)) + (bitselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) x y)) (umax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.SignedLessThan) x y) x y)) + (bitselect ty (icmp _ (IntCC.SignedLessThan) x y) x y)) (smin ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y)) + (bitselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) x y)) (smin ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) x y)) + (bitselect ty (icmp _ (IntCC.UnsignedLessThan) x y) x y)) (umin ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y)) + (bitselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) x y)) (umin ty x y)) ;; These are the same rules as above, but when the operands for select are swapped (rule (simplify - (vselect ty (icmp _ (IntCC.SignedLessThan) x y) y x)) + (bitselect ty (icmp _ (IntCC.SignedLessThan) x y) y x)) (smax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x)) + (bitselect ty (icmp _ (IntCC.SignedLessThanOrEqual) x y) y x)) (smax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedLessThan) x y) y x)) + (bitselect ty (icmp _ (IntCC.UnsignedLessThan) x y) y x)) (umax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x)) + (bitselect ty (icmp _ (IntCC.UnsignedLessThanOrEqual) x y) y x)) (umax ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.SignedGreaterThan) x y) y x)) + (bitselect ty (icmp _ (IntCC.SignedGreaterThan) x y) y x)) (smin ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x)) + (bitselect ty (icmp _ (IntCC.SignedGreaterThanOrEqual) x y) y x)) (smin ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x)) + (bitselect ty (icmp _ (IntCC.UnsignedGreaterThan) x y) y x)) (umin ty x y)) (rule (simplify - (vselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x)) + (bitselect ty (icmp _ (IntCC.UnsignedGreaterThanOrEqual) x y) y x)) (umin ty x y)) ;; For floats convert fcmp lt into pseudo_min and gt into pseudo_max @@ -520,13 +520,9 @@ (select ty (fcmp _ (FloatCC.GreaterThan) x y) x y)) (fmax_pseudo ty x y)) -;; Do the same for vectors -(rule (simplify - (vselect ty (fcmp _ (FloatCC.LessThan) x y) x y)) - (fmin_pseudo ty x y)) -(rule (simplify - (vselect ty (fcmp _ (FloatCC.GreaterThan) x y) x y)) - (fmax_pseudo ty x y)) +;; TODO: perform this same optimization to `f{min,max}_pseudo` for vectors +;; with the `bitselect` instruction, but the pattern is a bit more complicated +;; due to most bitselects-over-floats having bitcasts. ;; If both of the multiplied arguments to an `fma` are negated then remove ;; both of them since they cancel out. diff --git a/cranelift/filetests/filetests/egraph/vselect.clif b/cranelift/filetests/filetests/egraph/bitselect.clif similarity index 54% rename from cranelift/filetests/filetests/egraph/vselect.clif rename to cranelift/filetests/filetests/egraph/bitselect.clif index 805f7b61cc..91797bb397 100644 --- a/cranelift/filetests/filetests/egraph/vselect.clif +++ b/cranelift/filetests/filetests/egraph/bitselect.clif @@ -5,10 +5,10 @@ target x86_64 target aarch64 target s390x -function %vselect_sgt_to_smax(i32x4, i32x4) -> i32x4 { +function %bitselect_sgt_to_smax(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp sgt v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -17,11 +17,11 @@ block0(v0: i32x4, v1: i32x4): ; check: return v4 -; This tests an inverted vselect, where the operands are swapped. -function %vselect_sgt_to_smax(i32x4, i32x4) -> i32x4 { +; This tests an inverted bitselect, where the operands are swapped. +function %bitselect_sgt_to_smax(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp sgt v0, v1 - v3 = vselect v2, v1, v0 + v3 = bitselect v2, v1, v0 return v3 } @@ -31,10 +31,10 @@ block0(v0: i32x4, v1: i32x4): -function %vselect_sge_to_smax(i32x4, i32x4) -> i32x4 { +function %bitselect_sge_to_smax(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp sge v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -43,10 +43,10 @@ block0(v0: i32x4, v1: i32x4): ; check: return v4 -function %vselect_ugt_to_umax(i32x4, i32x4) -> i32x4 { +function %bitselect_ugt_to_umax(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp ugt v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -55,10 +55,10 @@ block0(v0: i32x4, v1: i32x4): ; check: return v4 -function %vselect_uge_to_umax(i32x4, i32x4) -> i32x4 { +function %bitselect_uge_to_umax(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp uge v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -68,10 +68,10 @@ block0(v0: i32x4, v1: i32x4): -function %vselect_slt_to_smin(i32x4, i32x4) -> i32x4 { +function %bitselect_slt_to_smin(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp slt v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -80,10 +80,10 @@ block0(v0: i32x4, v1: i32x4): ; check: return v4 -function %vselect_sle_to_smin(i32x4, i32x4) -> i32x4 { +function %bitselect_sle_to_smin(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp sle v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -92,10 +92,10 @@ block0(v0: i32x4, v1: i32x4): ; check: return v4 -function %vselect_ult_to_umin(i32x4, i32x4) -> i32x4 { +function %bitselect_ult_to_umin(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp ult v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -104,10 +104,10 @@ block0(v0: i32x4, v1: i32x4): ; check: return v4 -function %vselect_ule_to_umin(i32x4, i32x4) -> i32x4 { +function %bitselect_ule_to_umin(i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4): v2 = icmp ule v0, v1 - v3 = vselect v2, v0, v1 + v3 = bitselect v2, v0, v1 return v3 } @@ -117,38 +117,14 @@ block0(v0: i32x4, v1: i32x4): -function %vselect_with_different_regs_does_not_optimize(i32x4, i32x4, i32x4, i32x4) -> i32x4 { +function %bitselect_with_different_regs_does_not_optimize(i32x4, i32x4, i32x4, i32x4) -> i32x4 { block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4): v4 = icmp ule v0, v1 - v5 = vselect v4, v2, v3 + v5 = bitselect v4, v2, v3 return v5 } ; check: block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4): ; check: v4 = icmp ule v0, v1 -; check: v5 = vselect v4, v2, v3 +; check: v5 = bitselect v4, v2, v3 ; check: return v5 - - - -function %vselect_fcmp_gt_to_fmax_pseudo(f32x4, f32x4) -> f32x4 { -block0(v0: f32x4, v1: f32x4): - v2 = fcmp gt v0, v1 - v3 = vselect v2, v0, v1 - return v3 -} - -; check: block0(v0: f32x4, v1: f32x4): -; check: v4 = fmax_pseudo v0, v1 -; check: return v4 - -function %vselect_fcmp_lt_to_fmin_pseudo(f32x4, f32x4) -> f32x4 { -block0(v0: f32x4, v1: f32x4): - v2 = fcmp lt v0, v1 - v3 = vselect v2, v0, v1 - return v3 -} - -; check: block0(v0: f32x4, v1: f32x4): -; check: v4 = fmin_pseudo v0, v1 -; check: return v4 diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif index 0ae0eb407c..b4449a9670 100644 --- a/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif @@ -173,7 +173,7 @@ block0: function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8, v2: i16x8): - v3 = vselect v0, v1, v2 + v3 = bitselect v0, v1, v2 return v3 } @@ -187,9 +187,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8): ; bsl v0.16b, v1.16b, v2.16b ; ret -function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 { -block0(v0: i32x4, v1: f32x4, v2: f32x4): - v3 = vselect v0, v1, v2 +function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = bitselect v0, v1, v2 return v3 } @@ -203,9 +203,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4): ; bsl v0.16b, v1.16b, v2.16b ; ret -function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 { -block0(v0: i64x2, v1: f64x2, v2: f64x2): - v3 = vselect v0, v1, v2 +function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = bitselect v0, v1, v2 return v3 } diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif index 1fdbb2e64f..0a76e10711 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-bitwise.clif @@ -514,67 +514,3 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16): ; vsel %v24, %v25, %v26, %v24 ; br %r14 -function %vselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2, v2: i64x2): - v3 = vselect.i64x2 v0, v1, v2 - return v3 -} - -; VCode: -; block0: -; vsel %v24, %v25, %v26, %v24 -; br %r14 -; -; Disassembled: -; block0: ; offset 0x0 -; vsel %v24, %v25, %v26, %v24 -; br %r14 - -function %vselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 { -block0(v0: i32x4, v1: i32x4, v2: i32x4): - v3 = vselect.i32x4 v0, v1, v2 - return v3 -} - -; VCode: -; block0: -; vsel %v24, %v25, %v26, %v24 -; br %r14 -; -; Disassembled: -; block0: ; offset 0x0 -; vsel %v24, %v25, %v26, %v24 -; br %r14 - -function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { -block0(v0: i16x8, v1: i16x8, v2: i16x8): - v3 = vselect.i16x8 v0, v1, v2 - return v3 -} - -; VCode: -; block0: -; vsel %v24, %v25, %v26, %v24 -; br %r14 -; -; Disassembled: -; block0: ; offset 0x0 -; vsel %v24, %v25, %v26, %v24 -; br %r14 - -function %vselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 { -block0(v0: i8x16, v1: i8x16, v2: i8x16): - v3 = vselect.i8x16 v0, v1, v2 - return v3 -} - -; VCode: -; block0: -; vsel %v24, %v25, %v26, %v24 -; br %r14 -; -; Disassembled: -; block0: ; offset 0x0 -; vsel %v24, %v25, %v26, %v24 -; br %r14 - diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif index 479844fe63..4cf4352956 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif @@ -433,7 +433,7 @@ block0(v0: f64x2, v1: f64x2): function %i16x8_bitselect(i16x8, i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8, v2: i16x8): - v3 = vselect v0, v1, v2 + v3 = bitselect v0, v1, v2 return v3 } @@ -441,7 +441,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vpblendvb %xmm0, %xmm1, %xmm0, %xmm2 +; vpand %xmm1, %xmm0, %xmm4 +; vpandn %xmm0, %xmm2, %xmm6 +; vpor %xmm6, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -451,14 +453,16 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; vpand %xmm0, %xmm1, %xmm4 +; vpandn %xmm2, %xmm0, %xmm6 +; vpor %xmm4, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq -function %i32x4_bitselect(i32x4, f32x4, f32x4) -> f32x4 { -block0(v0: i32x4, v1: f32x4, v2: f32x4): - v3 = vselect v0, v1, v2 +function %f32x4_bitselect(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = bitselect v0, v1, v2 return v3 } @@ -466,7 +470,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vblendvps %xmm0, %xmm1, %xmm0, %xmm2 +; vandps %xmm1, %xmm0, %xmm4 +; vandnps %xmm0, %xmm2, %xmm6 +; vorps %xmm6, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -476,14 +482,16 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; vandps %xmm0, %xmm1, %xmm4 +; vandnps %xmm2, %xmm0, %xmm6 +; vorps %xmm4, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq -function %i64x2_bitselect(i64x2, f64x2, f64x2) -> f64x2 { -block0(v0: i64x2, v1: f64x2, v2: f64x2): - v3 = vselect v0, v1, v2 +function %f64x2_bitselect(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = bitselect v0, v1, v2 return v3 } @@ -491,7 +499,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; vblendvpd %xmm0, %xmm1, %xmm0, %xmm2 +; vandpd %xmm1, %xmm0, %xmm4 +; vandnpd %xmm0, %xmm2, %xmm6 +; vorpd %xmm6, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -501,7 +511,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; vandpd %xmm0, %xmm1, %xmm4 +; vandnpd %xmm2, %xmm0, %xmm6 +; vorpd %xmm4, %xmm6, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index f63cc22313..a0e4a9c279 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -229,7 +229,7 @@ block0(v0: i32x4, v1: i32x4): function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { block0(v0: i16x8, v1: i16x8, v2: i16x8): - v3 = vselect v0, v1, v2 + v3 = bitselect v0, v1, v2 return v3 } @@ -237,9 +237,10 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm2, %xmm4 -; pblendvb %xmm4, %xmm1, %xmm4 -; movdqa %xmm4, %xmm0 +; movdqa %xmm1, %xmm4 +; pand %xmm4, %xmm0, %xmm4 +; pandn %xmm0, %xmm2, %xmm0 +; por %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -249,16 +250,17 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm2, %xmm4 -; pblendvb %xmm0, %xmm1, %xmm4 -; movdqa %xmm4, %xmm0 +; movdqa %xmm1, %xmm4 +; pand %xmm0, %xmm4 +; pandn %xmm2, %xmm0 +; por %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq -function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 { -block0(v0: i32x4, v1: f32x4, v2: f32x4): - v3 = vselect v0, v1, v2 +function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4, v2: f32x4): + v3 = bitselect v0, v1, v2 return v3 } @@ -266,9 +268,10 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm2, %xmm4 -; blendvps %xmm4, %xmm1, %xmm4 -; movdqa %xmm4, %xmm0 +; movdqa %xmm1, %xmm4 +; andps %xmm4, %xmm0, %xmm4 +; andnps %xmm0, %xmm2, %xmm0 +; orps %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -278,16 +281,17 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm2, %xmm4 -; blendvps %xmm0, %xmm1, %xmm4 -; movdqa %xmm4, %xmm0 +; movdqa %xmm1, %xmm4 +; andps %xmm0, %xmm4 +; andnps %xmm2, %xmm0 +; orps %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq -function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 { -block0(v0: i64x2, v1: f64x2, v2: f64x2): - v3 = vselect v0, v1, v2 +function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2, v2: f64x2): + v3 = bitselect v0, v1, v2 return v3 } @@ -295,9 +299,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; movdqa %xmm2, %xmm4 -; blendvpd %xmm4, %xmm1, %xmm4 -; movdqa %xmm4, %xmm0 +; movdqa %xmm1, %xmm4 +; andpd %xmm4, %xmm0, %xmm4 +; andnpd %xmm0, %xmm2, %xmm0 +; orpd %xmm0, %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret @@ -307,9 +312,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2): ; pushq %rbp ; movq %rsp, %rbp ; block1: ; offset 0x4 -; movdqa %xmm2, %xmm4 -; blendvpd %xmm0, %xmm1, %xmm4 -; movdqa %xmm4, %xmm0 +; movdqa %xmm1, %xmm4 +; andpd %xmm0, %xmm4 +; andnpd %xmm2, %xmm0 +; orpd %xmm4, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; retq diff --git a/cranelift/filetests/filetests/runtests/simd-vselect.clif b/cranelift/filetests/filetests/runtests/simd-vselect.clif deleted file mode 100644 index 5d2ca1afe7..0000000000 --- a/cranelift/filetests/filetests/runtests/simd-vselect.clif +++ /dev/null @@ -1,82 +0,0 @@ -test interpret -test run -target s390x -target aarch64 -set enable_simd -target x86_64 has_sse3 has_ssse3 has_sse41 - -function %vselect_i8x16() -> i8x16 { -block0: - v1 = vconst.i8x16 [0 -1 0 -1 0 -1 -1 -1 -1 -1 0 0 0 0 0 0] - v2 = vconst.i8x16 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115] - v3 = vconst.i8x16 [200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215] - v4 = vselect v1, v2, v3 - return v4 -} -; run: %vselect_i8x16() == [200 101 202 103 204 105 106 107 108 109 210 211 212 213 214 215] - -function %vselect_i16x8() -> i16x8 { -block0: - v1 = vconst.i16x8 [0 -1 0 -1 0 -1 -1 -1] - v2 = vconst.i16x8 [100 101 102 103 104 105 106 107] - v3 = vconst.i16x8 [200 201 202 203 204 205 206 207] - v4 = vselect v1, v2, v3 - return v4 -} -; run: %vselect_i16x8() == [200 101 202 103 204 105 106 107] - -function %vselect_i32x4_const() -> i32x4 { -block0: - v1 = vconst.i32x4 [0 -1 0 -1] - v2 = vconst.i32x4 [100 101 102 103] - v3 = vconst.i32x4 [200 201 202 203] - v4 = vselect v1, v2, v3 - return v4 -} -; run: %vselect_i32x4_const() == [200 101 202 103] - -function %vselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 { -block0(v0: i32x4, v1: i32x4, v2: i32x4): - v3 = vselect v0, v1, v2 - return v3 -} -; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector. -; run: %vselect_i32x4([-1 -1 0 0], [1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4] - -function %vselect_i64x2() -> i64x2 { -block0: - v1 = vconst.i64x2 [0 -1] - v2 = vconst.i64x2 [100 101] - v3 = vconst.i64x2 [200 201] - v4 = vselect v1, v2, v3 - return v4 -} -; run: %vselect_i64x2() == [200 101] - -function %vselect_p_i8x16(i8x16, i8x16, i8x16) -> i8x16 { -block0(v0: i8x16, v1: i8x16, v2: i8x16): - v3 = vselect v0, v1, v2 - return v3 -} -; run: %vselect_p_i8x16([-1 0 -1 -1 -1 0 0 0 -1 0 -1 -1 -1 0 0 0], [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 18 3 4 5 22 23 24 9 26 11 12 13 30 31 32] - -function %vselect_p_i16x8(i16x8, i16x8, i16x8) -> i16x8 { -block0(v0: i16x8, v1: i16x8, v2: i16x8): - v3 = vselect v0, v1, v2 - return v3 -} -; run: %vselect_p_i16x8([-1 0 -1 -1 -1 0 0 0], [1 2 3 4 5 6 7 8], [17 18 19 20 21 22 23 24]) == [1 18 3 4 5 22 23 24] - -function %vselect_p_i32x4(i32x4, i32x4, i32x4) -> i32x4 { -block0(v0: i32x4, v1: i32x4, v2: i32x4): - v3 = vselect v0, v1, v2 - return v3 -} -; run: %vselect_p_i32x4([-1 0 -1 -1], [1 2 3 4], [100000 200000 300000 400000]) == [1 200000 3 4] - -function %vselect_p_i64x2(i64x2, i64x2, i64x2) -> i64x2 { -block0(v0: i64x2, v1: i64x2, v2: i64x2): - v3 = vselect v0, v1, v2 - return v3 -} -; run: %vselect_p_i64x2([-1 0], [1 2], [100000000000 200000000000]) == [1 200000000000] diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 56a894fd62..51251b6d47 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -603,11 +603,7 @@ where Opcode::Select | Opcode::SelectSpectreGuard => { choose(arg(0)?.into_bool()?, arg(1)?, arg(2)?) } - Opcode::Bitselect => { - let mask_a = Value::and(arg(0)?, arg(1)?)?; - let mask_b = Value::and(Value::not(arg(0)?)?, arg(2)?)?; - assign(Value::or(mask_a, mask_b)?) - } + Opcode::Bitselect => assign(bitselect(arg(0)?, arg(1)?, arg(2)?)?), Opcode::Icmp => assign(icmp( ctrl_ty, inst.cond_code().unwrap(), @@ -623,7 +619,7 @@ where Opcode::Smin => { if ctrl_ty.is_vector() { let icmp = icmp(ctrl_ty, IntCC::SignedGreaterThan, &arg(1)?, &arg(0)?)?; - assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) + assign(bitselect(icmp, arg(0)?, arg(1)?)?) } else { choose(Value::gt(&arg(1)?, &arg(0)?)?, arg(0)?, arg(1)?) } @@ -631,7 +627,7 @@ where Opcode::Umin => { if ctrl_ty.is_vector() { let icmp = icmp(ctrl_ty, IntCC::UnsignedGreaterThan, &arg(1)?, &arg(0)?)?; - assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) + assign(bitselect(icmp, arg(0)?, arg(1)?)?) } else { choose( Value::gt( @@ -646,7 +642,7 @@ where Opcode::Smax => { if ctrl_ty.is_vector() { let icmp = icmp(ctrl_ty, IntCC::SignedGreaterThan, &arg(0)?, &arg(1)?)?; - assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) + assign(bitselect(icmp, arg(0)?, arg(1)?)?) } else { choose(Value::gt(&arg(0)?, &arg(1)?)?, arg(0)?, arg(1)?) } @@ -654,7 +650,7 @@ where Opcode::Umax => { if ctrl_ty.is_vector() { let icmp = icmp(ctrl_ty, IntCC::UnsignedGreaterThan, &arg(0)?, &arg(1)?)?; - assign(vselect(&icmp, &arg(0)?, &arg(1)?, ctrl_ty)?) + assign(bitselect(icmp, arg(0)?, arg(1)?)?) } else { choose( Value::gt( @@ -1067,7 +1063,6 @@ where } assign(Value::int(result, ctrl_ty)?) } - Opcode::Vselect => assign(vselect(&arg(0)?, &arg(1)?, &arg(2)?, ctrl_ty)?), Opcode::VanyTrue => { let lane_ty = ctrl_ty.lane_type(); let init = V::bool(false, true, lane_ty)?; @@ -1641,20 +1636,11 @@ where vectorizelanes(&result, vector_type) } -fn vselect(c: &V, x: &V, y: &V, vector_type: types::Type) -> ValueResult +fn bitselect(c: V, x: V, y: V) -> ValueResult where V: Value, { - let c = extractlanes(c, vector_type)?; - let x = extractlanes(x, vector_type)?; - let y = extractlanes(y, vector_type)?; - let mut new_vec = SimdVec::new(); - for (c, (x, y)) in c.into_iter().zip(x.into_iter().zip(y.into_iter())) { - if Value::eq(&c, &Value::int(0, vector_type.lane_type())?)? { - new_vec.push(y); - } else { - new_vec.push(x); - } - } - vectorizelanes(&new_vec, vector_type) + let mask_x = Value::and(c.clone(), x)?; + let mask_y = Value::and(Value::not(c)?, y)?; + Value::or(mask_x, mask_y) } diff --git a/cranelift/interpreter/src/value.rs b/cranelift/interpreter/src/value.rs index 2262d6a06f..704a7002ab 100644 --- a/cranelift/interpreter/src/value.rs +++ b/cranelift/interpreter/src/value.rs @@ -218,6 +218,28 @@ macro_rules! binary_match { }; } +macro_rules! bitop { + ( $op:tt($arg1:expr, $arg2:expr) ) => { + Ok(match ($arg1, $arg2) { + (DataValue::I8(a), DataValue::I8(b)) => DataValue::I8(a $op b), + (DataValue::I16(a), DataValue::I16(b)) => DataValue::I16(a $op b), + (DataValue::I32(a), DataValue::I32(b)) => DataValue::I32(a $op b), + (DataValue::I64(a), DataValue::I64(b)) => DataValue::I64(a $op b), + (DataValue::I128(a), DataValue::I128(b)) => DataValue::I128(a $op b), + (DataValue::F32(a), DataValue::F32(b)) => DataValue::F32(a $op b), + (DataValue::F64(a), DataValue::F64(b)) => DataValue::F64(a $op b), + (DataValue::V128(a), DataValue::V128(b)) => { + let mut a2 = a.clone(); + for (a, b) in a2.iter_mut().zip(b.iter()) { + *a = *a $op *b; + } + DataValue::V128(a2) + } + _ => unimplemented!(), + }) + }; +} + impl Value for DataValue { fn ty(&self) -> Type { self.ty() @@ -686,19 +708,35 @@ impl Value for DataValue { } fn and(self, other: Self) -> ValueResult { - binary_match!(&(self, other); [I8, I16, I32, I64, I128, F32, F64]) + bitop!(&(self, other)) } fn or(self, other: Self) -> ValueResult { - binary_match!(|(self, other); [I8, I16, I32, I64, I128, F32, F64]) + bitop!(|(self, other)) } fn xor(self, other: Self) -> ValueResult { - binary_match!(^(self, other); [I8, I16, I32, I64, I128, F32, F64]) + bitop!(^(self, other)) } fn not(self) -> ValueResult { - unary_match!(!(self); [I8, I16, I32, I64, I128, F32, F64]) + Ok(match self { + DataValue::I8(a) => DataValue::I8(!a), + DataValue::I16(a) => DataValue::I16(!a), + DataValue::I32(a) => DataValue::I32(!a), + DataValue::I64(a) => DataValue::I64(!a), + DataValue::I128(a) => DataValue::I128(!a), + DataValue::F32(a) => DataValue::F32(!a), + DataValue::F64(a) => DataValue::F64(!a), + DataValue::V128(a) => { + let mut a2 = a.clone(); + for a in a2.iter_mut() { + *a = !*a; + } + DataValue::V128(a2) + } + _ => unimplemented!(), + }) } fn count_ones(self) -> ValueResult {