Remove the widening_pairwise_dot_product_s clif instruction (#5889)
This was added for the wasm SIMD proposal, but I've been poking around at this recently and I believe the instruction can instead be represented by its component parts with the same semantics. This commit removes the instruction, represents it instead with the existing `iadd_pairwise` instruction (among others), and updates the backends with new pattern matches so that they produce the same codegen as before. Interestingly, this entirely removed the codegen rule on the AArch64 backend with no replacement, as the existing rules already produce the same codegen.
This commit is contained in:
@@ -3134,41 +3134,6 @@ pub(crate) fn define(
|
|||||||
.operands_out(vec![a]),
|
.operands_out(vec![a]),
|
||||||
);
|
);
|
||||||
|
|
||||||
let I16x8 = &TypeVar::new(
|
|
||||||
"I16x8",
|
|
||||||
"A SIMD vector type containing 8 integer lanes each 16 bits wide.",
|
|
||||||
TypeSetBuilder::new()
|
|
||||||
.ints(16..16)
|
|
||||||
.simd_lanes(8..8)
|
|
||||||
.includes_scalars(false)
|
|
||||||
.build(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let x = &Operand::new("x", I16x8);
|
|
||||||
let y = &Operand::new("y", I16x8);
|
|
||||||
let a = &Operand::new("a", &I16x8.merge_lanes());
|
|
||||||
|
|
||||||
ig.push(
|
|
||||||
Inst::new(
|
|
||||||
"widening_pairwise_dot_product_s",
|
|
||||||
r#"
|
|
||||||
Takes corresponding elements in `x` and `y`, performs a sign-extending length-doubling
|
|
||||||
multiplication on them, then adds adjacent pairs of elements to form the result. For
|
|
||||||
example, if the input vectors are `[x3, x2, x1, x0]` and `[y3, y2, y1, y0]`, it produces
|
|
||||||
the vector `[r1, r0]`, where `r1 = sx(x3) * sx(y3) + sx(x2) * sx(y2)` and
|
|
||||||
`r0 = sx(x1) * sx(y1) + sx(x0) * sx(y0)`, and `sx(n)` sign-extends `n` to twice its width.
|
|
||||||
|
|
||||||
This will double the lane width and halve the number of lanes. So the resulting
|
|
||||||
vector has the same number of bits as `x` and `y` do (individually).
|
|
||||||
|
|
||||||
See <https://github.com/WebAssembly/simd/pull/127> for background info.
|
|
||||||
"#,
|
|
||||||
&formats.binary,
|
|
||||||
)
|
|
||||||
.operands_in(vec![x, y])
|
|
||||||
.operands_out(vec![a]),
|
|
||||||
);
|
|
||||||
|
|
||||||
let IntTo = &TypeVar::new(
|
let IntTo = &TypeVar::new(
|
||||||
"IntTo",
|
"IntTo",
|
||||||
"A larger integer type with the same number of lanes",
|
"A larger integer type with the same number of lanes",
|
||||||
|
|||||||
@@ -204,15 +204,19 @@
|
|||||||
|
|
||||||
;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction
|
||||||
(rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low x) (swiden_high x))))
|
(rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low x) (swiden_high x))))
|
||||||
(saddlp8 x))
|
(saddlp8 x))
|
||||||
|
|
||||||
|
;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction
|
||||||
(rule (lower (has_type $I32X4 (iadd_pairwise (swiden_low x) (swiden_high x))))
|
(rule (lower (has_type $I32X4 (iadd_pairwise (swiden_low x) (swiden_high x))))
|
||||||
(saddlp16 x))
|
(saddlp16 x))
|
||||||
|
|
||||||
|
;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
|
||||||
(rule (lower (has_type $I16X8 (iadd_pairwise (uwiden_low x) (uwiden_high x))))
|
(rule (lower (has_type $I16X8 (iadd_pairwise (uwiden_low x) (uwiden_high x))))
|
||||||
(uaddlp8 x))
|
(uaddlp8 x))
|
||||||
|
|
||||||
|
;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
|
||||||
(rule (lower (has_type $I32X4 (iadd_pairwise (uwiden_low x) (uwiden_high x))))
|
(rule (lower (has_type $I32X4 (iadd_pairwise (uwiden_low x) (uwiden_high x))))
|
||||||
(uaddlp16 x))
|
(uaddlp16 x))
|
||||||
|
|
||||||
@@ -2030,18 +2034,6 @@
|
|||||||
(let ((tmp Reg (fpu_move_from_vec x 1 (VectorSize.Size32x2))))
|
(let ((tmp Reg (fpu_move_from_vec x 1 (VectorSize.Size32x2))))
|
||||||
(vec_extend (VecExtendOp.Uxtl) tmp $false (lane_size ty))))
|
(vec_extend (VecExtendOp.Uxtl) tmp $false (lane_size ty))))
|
||||||
|
|
||||||
;;;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
|
|
||||||
;; The args have type I16X8.
|
|
||||||
;; "dst = i32x4.dot_i16x8_s(x, y)"
|
|
||||||
;; => smull tmp, x, y
|
|
||||||
;; smull2 dst, x, y
|
|
||||||
;; addp dst, tmp, dst
|
|
||||||
(rule (lower (has_type $I32X4 (widening_pairwise_dot_product_s x y)))
|
|
||||||
(let ((tmp Reg (vec_rrr_long (VecRRRLongOp.Smull16) x y $false))
|
|
||||||
(dst Reg (vec_rrr_long (VecRRRLongOp.Smull16) x y $true)))
|
|
||||||
(vec_rrr (VecALUOp.Addp) tmp dst (VectorSize.Size32x4))))
|
|
||||||
|
|
||||||
;;;; Rules for `Fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Rules for `Fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(rule (lower (fence))
|
(rule (lower (fence))
|
||||||
|
|||||||
@@ -138,6 +138,14 @@
|
|||||||
(vec_add ty x (vec_lshr_by_byte x size))
|
(vec_add ty x (vec_lshr_by_byte x size))
|
||||||
(vec_add ty y (vec_lshr_by_byte y size)))))
|
(vec_add ty y (vec_lshr_by_byte y size)))))
|
||||||
|
|
||||||
|
;; special case for the `i32x4.dot_i16x8_s` wasm instruction
|
||||||
|
(rule 1 (lower
|
||||||
|
(has_type dst_ty (iadd_pairwise
|
||||||
|
(imul (swiden_low x @ (value_type src_ty)) (swiden_low y))
|
||||||
|
(imul (swiden_high x) (swiden_high y)))))
|
||||||
|
(vec_add dst_ty (vec_smul_even src_ty x y)
|
||||||
|
(vec_smul_odd src_ty x y)))
|
||||||
|
|
||||||
|
|
||||||
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
@@ -484,15 +492,6 @@
|
|||||||
(mov_to_vec128 $I64X2 res_0 res_1)))
|
(mov_to_vec128 $I64X2 res_0 res_1)))
|
||||||
|
|
||||||
|
|
||||||
;;;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
|
|
||||||
;; Widening pairwise dot product of two vector registers.
|
|
||||||
(rule (lower (has_type dst_ty (widening_pairwise_dot_product_s
|
|
||||||
x @ (value_type src_ty) y)))
|
|
||||||
(vec_add dst_ty (vec_smul_even src_ty x y)
|
|
||||||
(vec_smul_odd src_ty x y)))
|
|
||||||
|
|
||||||
|
|
||||||
;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
;; Fixed-point multiplication of two vector registers.
|
;; Fixed-point multiplication of two vector registers.
|
||||||
|
|||||||
@@ -2147,12 +2147,6 @@
|
|||||||
(rule (lower (debugtrap))
|
(rule (lower (debugtrap))
|
||||||
(side_effect (x64_hlt)))
|
(side_effect (x64_hlt)))
|
||||||
|
|
||||||
;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
|
|
||||||
(rule (lower (has_type $I32X4
|
|
||||||
(widening_pairwise_dot_product_s x y)))
|
|
||||||
(x64_pmaddwd x y))
|
|
||||||
|
|
||||||
;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(rule (lower (has_type $F32 (fadd x y)))
|
(rule (lower (has_type $F32 (fadd x y)))
|
||||||
@@ -3179,6 +3173,7 @@
|
|||||||
|
|
||||||
;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction
|
||||||
(rule (lower
|
(rule (lower
|
||||||
(has_type $I16X8 (iadd_pairwise
|
(has_type $I16X8 (iadd_pairwise
|
||||||
(swiden_low val @ (value_type $I8X16))
|
(swiden_low val @ (value_type $I8X16))
|
||||||
@@ -3186,6 +3181,7 @@
|
|||||||
(let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16))))
|
(let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16))))
|
||||||
(x64_pmaddubsw mul_const val)))
|
(x64_pmaddubsw mul_const val)))
|
||||||
|
|
||||||
|
;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction
|
||||||
(rule (lower
|
(rule (lower
|
||||||
(has_type $I32X4 (iadd_pairwise
|
(has_type $I32X4 (iadd_pairwise
|
||||||
(swiden_low val @ (value_type $I16X8))
|
(swiden_low val @ (value_type $I16X8))
|
||||||
@@ -3193,6 +3189,7 @@
|
|||||||
(let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32))))
|
(let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32))))
|
||||||
(x64_pmaddwd val mul_const)))
|
(x64_pmaddwd val mul_const)))
|
||||||
|
|
||||||
|
;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
|
||||||
(rule (lower
|
(rule (lower
|
||||||
(has_type $I16X8 (iadd_pairwise
|
(has_type $I16X8 (iadd_pairwise
|
||||||
(uwiden_low val @ (value_type $I8X16))
|
(uwiden_low val @ (value_type $I8X16))
|
||||||
@@ -3200,6 +3197,7 @@
|
|||||||
(let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16))))
|
(let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16))))
|
||||||
(x64_pmaddubsw val mul_const)))
|
(x64_pmaddubsw val mul_const)))
|
||||||
|
|
||||||
|
;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
|
||||||
(rule (lower
|
(rule (lower
|
||||||
(has_type $I32X4 (iadd_pairwise
|
(has_type $I32X4 (iadd_pairwise
|
||||||
(uwiden_low val @ (value_type $I16X8))
|
(uwiden_low val @ (value_type $I16X8))
|
||||||
@@ -3213,6 +3211,13 @@
|
|||||||
(addd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_addd_const_32))))
|
(addd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_addd_const_32))))
|
||||||
(x64_paddd dst addd_const)))
|
(x64_paddd dst addd_const)))
|
||||||
|
|
||||||
|
;; special case for the `i32x4.dot_i16x8_s` wasm instruction
|
||||||
|
(rule (lower
|
||||||
|
(has_type $I32X4 (iadd_pairwise
|
||||||
|
(imul (swiden_low x) (swiden_low y))
|
||||||
|
(imul (swiden_high x) (swiden_high y)))))
|
||||||
|
(x64_pmaddwd x y))
|
||||||
|
|
||||||
;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(rule (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16))))
|
(rule (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16))))
|
||||||
|
|||||||
@@ -1295,8 +1295,14 @@ block0(v0: i8x16, v1: i8x16):
|
|||||||
|
|
||||||
function %widening_pairwise_dot_product_s_i16x8(i16x8, i16x8) -> i32x4 {
|
function %widening_pairwise_dot_product_s_i16x8(i16x8, i16x8) -> i32x4 {
|
||||||
block0(v0: i16x8, v1: i16x8):
|
block0(v0: i16x8, v1: i16x8):
|
||||||
v2 = widening_pairwise_dot_product_s v0, v1
|
v2 = swiden_low v0
|
||||||
return v2
|
v3 = swiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
v5 = swiden_high v0
|
||||||
|
v6 = swiden_high v1
|
||||||
|
v7 = imul v5, v6
|
||||||
|
v8 = iadd_pairwise v4, v7
|
||||||
|
return v8
|
||||||
}
|
}
|
||||||
|
|
||||||
; VCode:
|
; VCode:
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
test interpret
|
|
||||||
test run
|
test run
|
||||||
target aarch64
|
target aarch64
|
||||||
target s390x
|
target s390x
|
||||||
@@ -7,8 +6,14 @@ target x86_64 has_sse3 has_ssse3 has_sse41
|
|||||||
|
|
||||||
function %wpdps(i16x8, i16x8) -> i32x4 {
|
function %wpdps(i16x8, i16x8) -> i32x4 {
|
||||||
block0(v0: i16x8, v1: i16x8):
|
block0(v0: i16x8, v1: i16x8):
|
||||||
v2 = widening_pairwise_dot_product_s v0, v1
|
v2 = swiden_low v0
|
||||||
return v2
|
v3 = swiden_low v1
|
||||||
|
v4 = imul v2, v3
|
||||||
|
v5 = swiden_high v0
|
||||||
|
v6 = swiden_high v1
|
||||||
|
v7 = imul v5, v6
|
||||||
|
v8 = iadd_pairwise v4, v7
|
||||||
|
return v8
|
||||||
}
|
}
|
||||||
; run: %wpdps([1 2 3 4 5 6 7 8], [8000 7000 6000 5000 4000 3000 2000 1000]) == [22000 38000 38000 22000]
|
; run: %wpdps([1 2 3 4 5 6 7 8], [8000 7000 6000 5000 4000 3000 2000 1000]) == [22000 38000 38000 22000]
|
||||||
; run: %wpdps([1 -2 3 -4 5 -6 7 -8], [32767 32767 32767 32767 -32768 -32768 -32768 -32768]) == [-32767 -32767 32768 32768]
|
; run: %wpdps([1 -2 3 -4 5 -6 7 -8], [32767 32767 32767 32767 -32768 -32768 -32768 -32768]) == [-32767 -32767 32768 32768]
|
||||||
|
|||||||
@@ -1308,26 +1308,6 @@ where
|
|||||||
// actually need to emit a fence here.
|
// actually need to emit a fence here.
|
||||||
ControlFlow::Continue
|
ControlFlow::Continue
|
||||||
}
|
}
|
||||||
Opcode::WideningPairwiseDotProductS => {
|
|
||||||
let ctrl_ty = types::I16X8;
|
|
||||||
let new_type = ctrl_ty.merge_lanes().unwrap();
|
|
||||||
let arg0 = extractlanes(&arg(0)?, ctrl_ty)?;
|
|
||||||
let arg1 = extractlanes(&arg(1)?, ctrl_ty)?;
|
|
||||||
let new_vec = arg0
|
|
||||||
.chunks(2)
|
|
||||||
.into_iter()
|
|
||||||
.zip(arg1.chunks(2))
|
|
||||||
.into_iter()
|
|
||||||
.map(|(x, y)| {
|
|
||||||
let mut z = 0i128;
|
|
||||||
for (lhs, rhs) in x.into_iter().zip(y.into_iter()) {
|
|
||||||
z += lhs.clone().into_int()? * rhs.clone().into_int()?;
|
|
||||||
}
|
|
||||||
Value::int(z, new_type.lane_type())
|
|
||||||
})
|
|
||||||
.collect::<ValueResult<Vec<_>>>()?;
|
|
||||||
assign(vectorizelanes(&new_vec, new_type)?)
|
|
||||||
}
|
|
||||||
Opcode::SqmulRoundSat => {
|
Opcode::SqmulRoundSat => {
|
||||||
let lane_type = ctrl_ty.lane_type();
|
let lane_type = ctrl_ty.lane_type();
|
||||||
let double_width = ctrl_ty.double_width().unwrap().lane_type();
|
let double_width = ctrl_ty.double_width().unwrap().lane_type();
|
||||||
|
|||||||
@@ -2059,7 +2059,13 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
|||||||
}
|
}
|
||||||
Operator::I32x4DotI16x8S => {
|
Operator::I32x4DotI16x8S => {
|
||||||
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
|
let (a, b) = pop2_with_bitcast(state, I16X8, builder);
|
||||||
state.push1(builder.ins().widening_pairwise_dot_product_s(a, b));
|
let alow = builder.ins().swiden_low(a);
|
||||||
|
let blow = builder.ins().swiden_low(b);
|
||||||
|
let low = builder.ins().imul(alow, blow);
|
||||||
|
let ahigh = builder.ins().swiden_high(a);
|
||||||
|
let bhigh = builder.ins().swiden_high(b);
|
||||||
|
let high = builder.ins().imul(ahigh, bhigh);
|
||||||
|
state.push1(builder.ins().iadd_pairwise(low, high));
|
||||||
}
|
}
|
||||||
Operator::I8x16Popcnt => {
|
Operator::I8x16Popcnt => {
|
||||||
let arg = pop1_with_bitcast(state, type_of(op), builder);
|
let arg = pop1_with_bitcast(state, type_of(op), builder);
|
||||||
|
|||||||
Reference in New Issue
Block a user