diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 56a135072a..a54a9818ca 100755 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -3134,41 +3134,6 @@ pub(crate) fn define( .operands_out(vec![a]), ); - let I16x8 = &TypeVar::new( - "I16x8", - "A SIMD vector type containing 8 integer lanes each 16 bits wide.", - TypeSetBuilder::new() - .ints(16..16) - .simd_lanes(8..8) - .includes_scalars(false) - .build(), - ); - - let x = &Operand::new("x", I16x8); - let y = &Operand::new("y", I16x8); - let a = &Operand::new("a", &I16x8.merge_lanes()); - - ig.push( - Inst::new( - "widening_pairwise_dot_product_s", - r#" - Takes corresponding elements in `x` and `y`, performs a sign-extending length-doubling - multiplication on them, then adds adjacent pairs of elements to form the result. For - example, if the input vectors are `[x3, x2, x1, x0]` and `[y3, y2, y1, y0]`, it produces - the vector `[r1, r0]`, where `r1 = sx(x3) * sx(y3) + sx(x2) * sx(y2)` and - `r0 = sx(x1) * sx(y1) + sx(x0) * sx(y0)`, and `sx(n)` sign-extends `n` to twice its width. - - This will double the lane width and halve the number of lanes. So the resulting - vector has the same number of bits as `x` and `y` do (individually). - - See <https://github.com/WebAssembly/simd/pull/127> for background info.
- "#, - &formats.binary, - ) - .operands_in(vec![x, y]) - .operands_out(vec![a]), - ); - let IntTo = &TypeVar::new( "IntTo", "A larger integer type with the same number of lanes", diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index fca811ea82..3eec28c118 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -204,15 +204,19 @@ ;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction (rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low x) (swiden_high x)))) (saddlp8 x)) +;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction (rule (lower (has_type $I32X4 (iadd_pairwise (swiden_low x) (swiden_high x)))) (saddlp16 x)) +;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction (rule (lower (has_type $I16X8 (iadd_pairwise (uwiden_low x) (uwiden_high x)))) (uaddlp8 x)) +;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction (rule (lower (has_type $I32X4 (iadd_pairwise (uwiden_low x) (uwiden_high x)))) (uaddlp16 x)) @@ -2030,18 +2034,6 @@ (let ((tmp Reg (fpu_move_from_vec x 1 (VectorSize.Size32x2)))) (vec_extend (VecExtendOp.Uxtl) tmp $false (lane_size ty)))) -;;;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;; - -;; The args have type I16X8. 
-;; "dst = i32x4.dot_i16x8_s(x, y)" -;; => smull tmp, x, y -;; smull2 dst, x, y -;; addp dst, tmp, dst -(rule (lower (has_type $I32X4 (widening_pairwise_dot_product_s x y))) - (let ((tmp Reg (vec_rrr_long (VecRRRLongOp.Smull16) x y $false)) - (dst Reg (vec_rrr_long (VecRRRLongOp.Smull16) x y $true))) - (vec_rrr (VecALUOp.Addp) tmp dst (VectorSize.Size32x4)))) - ;;;; Rules for `Fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (fence)) diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index ed725d1dc3..84e10b4fda 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -138,6 +138,14 @@ (vec_add ty x (vec_lshr_by_byte x size)) (vec_add ty y (vec_lshr_by_byte y size))))) +;; special case for the `i32x4.dot_i16x8_s` wasm instruction +(rule 1 (lower + (has_type dst_ty (iadd_pairwise + (imul (swiden_low x @ (value_type src_ty)) (swiden_low y)) + (imul (swiden_high x) (swiden_high y))))) + (vec_add dst_ty (vec_smul_even src_ty x y) + (vec_smul_odd src_ty x y))) + ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -484,15 +492,6 @@ (mov_to_vec128 $I64X2 res_0 res_1))) -;;;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; Widening pairwise dot product of two vector registers. -(rule (lower (has_type dst_ty (widening_pairwise_dot_product_s - x @ (value_type src_ty) y))) - (vec_add dst_ty (vec_smul_even src_ty x y) - (vec_smul_odd src_ty x y))) - - ;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Fixed-point multiplication of two vector registers. 
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index dcc8bd1f0f..84f9f3ef80 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -2147,12 +2147,6 @@ (rule (lower (debugtrap)) (side_effect (x64_hlt))) -;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule (lower (has_type $I32X4 - (widening_pairwise_dot_product_s x y))) - (x64_pmaddwd x y)) - ;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $F32 (fadd x y))) @@ -3179,6 +3173,7 @@ ;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction (rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low val @ (value_type $I8X16)) @@ -3186,6 +3181,7 @@ (let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16)))) (x64_pmaddubsw mul_const val))) +;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction (rule (lower (has_type $I32X4 (iadd_pairwise (swiden_low val @ (value_type $I16X8)) @@ -3193,6 +3189,7 @@ (let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32)))) (x64_pmaddwd val mul_const))) +;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction (rule (lower (has_type $I16X8 (iadd_pairwise (uwiden_low val @ (value_type $I8X16)) @@ -3200,6 +3197,7 @@ (let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16)))) (x64_pmaddubsw val mul_const))) +;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction (rule (lower (has_type $I32X4 (iadd_pairwise (uwiden_low val @ (value_type $I16X8)) @@ -3213,6 +3211,13 @@ (addd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_addd_const_32)))) (x64_paddd dst addd_const))) +;; special case for the `i32x4.dot_i16x8_s` wasm instruction +(rule (lower + (has_type $I32X4 (iadd_pairwise + (imul (swiden_low x) 
(swiden_low y)) + (imul (swiden_high x) (swiden_high y))))) + (x64_pmaddwd x y)) + ;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16)))) diff --git a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif index bb8da42fe5..960477cbbd 100644 --- a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif +++ b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic.clif @@ -1295,8 +1295,14 @@ block0(v0: i8x16, v1: i8x16): function %widening_pairwise_dot_product_s_i16x8(i16x8, i16x8) -> i32x4 { block0(v0: i16x8, v1: i16x8): - v2 = widening_pairwise_dot_product_s v0, v1 - return v2 + v2 = swiden_low v0 + v3 = swiden_low v1 + v4 = imul v2, v3 + v5 = swiden_high v0 + v6 = swiden_high v1 + v7 = imul v5, v6 + v8 = iadd_pairwise v4, v7 + return v8 } ; VCode: diff --git a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif index c38099c429..205f431811 100644 --- a/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif +++ b/cranelift/filetests/filetests/runtests/simd-wideningpairwisedotproducts.clif @@ -1,4 +1,3 @@ -test interpret test run target aarch64 target s390x @@ -7,8 +6,14 @@ target x86_64 has_sse3 has_ssse3 has_sse41 function %wpdps(i16x8, i16x8) -> i32x4 { block0(v0: i16x8, v1: i16x8): - v2 = widening_pairwise_dot_product_s v0, v1 - return v2 + v2 = swiden_low v0 + v3 = swiden_low v1 + v4 = imul v2, v3 + v5 = swiden_high v0 + v6 = swiden_high v1 + v7 = imul v5, v6 + v8 = iadd_pairwise v4, v7 + return v8 } ; run: %wpdps([1 2 3 4 5 6 7 8], [8000 7000 6000 5000 4000 3000 2000 1000]) == [22000 38000 38000 22000] ; run: %wpdps([1 -2 3 -4 5 -6 7 -8], [32767 32767 32767 32767 -32768 -32768 -32768 -32768]) == [-32767 -32767 32768 32768] diff --git a/cranelift/interpreter/src/step.rs 
b/cranelift/interpreter/src/step.rs index e64d90838a..d4da0a2d2c 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -1308,26 +1308,6 @@ where // actually need to emit a fence here. ControlFlow::Continue } - Opcode::WideningPairwiseDotProductS => { - let ctrl_ty = types::I16X8; - let new_type = ctrl_ty.merge_lanes().unwrap(); - let arg0 = extractlanes(&arg(0)?, ctrl_ty)?; - let arg1 = extractlanes(&arg(1)?, ctrl_ty)?; - let new_vec = arg0 - .chunks(2) - .into_iter() - .zip(arg1.chunks(2)) - .into_iter() - .map(|(x, y)| { - let mut z = 0i128; - for (lhs, rhs) in x.into_iter().zip(y.into_iter()) { - z += lhs.clone().into_int()? * rhs.clone().into_int()?; - } - Value::int(z, new_type.lane_type()) - }) - .collect::<ValueResult<SimdVec<V>>>()?; - assign(vectorizelanes(&new_vec, new_type)?) - } Opcode::SqmulRoundSat => { let lane_type = ctrl_ty.lane_type(); let double_width = ctrl_ty.double_width().unwrap().lane_type(); diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index fc5c05c66d..94301f9327 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -2059,7 +2059,13 @@ pub fn translate_operator( } Operator::I32x4DotI16x8S => { let (a, b) = pop2_with_bitcast(state, I16X8, builder); - state.push1(builder.ins().widening_pairwise_dot_product_s(a, b)); + let alow = builder.ins().swiden_low(a); + let blow = builder.ins().swiden_low(b); + let low = builder.ins().imul(alow, blow); + let ahigh = builder.ins().swiden_high(a); + let bhigh = builder.ins().swiden_high(b); + let high = builder.ins().imul(ahigh, bhigh); + state.push1(builder.ins().iadd_pairwise(low, high)); } Operator::I8x16Popcnt => { let arg = pop1_with_bitcast(state, type_of(op), builder);