diff --git a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle
index e97c398d2f..bf99f57f71 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower_dynamic_neon.isle
@@ -99,22 +99,12 @@
 (rule (lower (extract_vector x 0))
       (value_reg (fpu_move_128 (put_in_reg x))))
 
-;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule -1 (lower (has_type ty (swiden_low x)))
-  (value_reg (vec_extend (VecExtendOp.Sxtl) x $false (lane_size ty))))
-
 ;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule -1 (lower (has_type ty (swiden_high x)))
-  (value_reg (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty))))
-
-;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule -1 (lower (has_type ty (uwiden_low x)))
-  (value_reg (vec_extend (VecExtendOp.Uxtl) x $false (lane_size ty))))
+  (vec_extend (VecExtendOp.Sxtl) x $true (lane_size ty)))
 
 ;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule -1 (lower (has_type ty (uwiden_high x)))
-  (value_reg (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty))))
+  (vec_extend (VecExtendOp.Uxtl) x $true (lane_size ty)))
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 14b07ff141..5abc503259 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -905,10 +905,10 @@
 
 ;; (No i8x16 multiply.)
 
-(rule 1 (lower (has_type (multi_lane 16 8) (imul x y)))
+(rule (lower (has_type (multi_lane 16 8) (imul x y)))
       (x64_pmullw x y))
 
-(rule 1 (lower (has_type (multi_lane 32 4) (imul x y)))
+(rule (lower (has_type (multi_lane 32 4) (imul x y)))
       (x64_pmulld x y))
 
 ;; With AVX-512 we can implement `i64x2` multiplication with a single
@@ -939,7 +939,7 @@
 ;; the lane of the destination. For this reason we don't need shifts to isolate
 ;; the lower 32-bits, however, we will need to use shifts to isolate the high
 ;; 32-bits when doing calculations, i.e., `Ah == A >> 32`.
-(rule 1 (lower (has_type (multi_lane 64 2)
+(rule (lower (has_type (multi_lane 64 2)
                        (imul a b)))
       (let ((a0 Xmm a)
             (b0 Xmm b)
@@ -961,7 +961,7 @@
         (x64_paddq al_bl aa_bb_shifted)))
 
 ;; Special case for `i16x8.extmul_high_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
                        (imul (swiden_high (and (value_type (multi_lane 8 16))
                                                x))
                              (swiden_high (and (value_type (multi_lane 8 16))
@@ -975,7 +975,7 @@
         (x64_pmullw x3 y3)))
 
 ;; Special case for `i32x4.extmul_high_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
                        (imul (swiden_high (and (value_type (multi_lane 16 8))
                                                x))
                              (swiden_high (and (value_type (multi_lane 16 8))
@@ -987,7 +987,7 @@
         (x64_punpckhwd lo hi)))
 
 ;; Special case for `i64x2.extmul_high_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
                        (imul (swiden_high (and (value_type (multi_lane 32 4))
                                                x))
                              (swiden_high (and (value_type (multi_lane 32 4))
@@ -1001,7 +1001,7 @@
         (x64_pmuldq x2 y2)))
 
 ;; Special case for `i16x8.extmul_low_i8x16_s`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
                        (imul (swiden_low (and (value_type (multi_lane 8 16))
                                               x))
                              (swiden_low (and (value_type (multi_lane 8 16))
@@ -1011,7 +1011,7 @@
         (x64_pmullw x2 y2)))
 
 ;; Special case for `i32x4.extmul_low_i16x8_s`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
                        (imul (swiden_low (and (value_type (multi_lane 16 8))
                                               x))
                              (swiden_low (and (value_type (multi_lane 16 8))
@@ -1023,7 +1023,7 @@
         (x64_punpcklwd lo hi)))
 
 ;; Special case for `i64x2.extmul_low_i32x4_s`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
                        (imul (swiden_low (and (value_type (multi_lane 32 4))
                                               x))
                              (swiden_low (and (value_type (multi_lane 32 4))
@@ -1037,7 +1037,7 @@
         (x64_pmuldq x2 y2)))
 
 ;; Special case for `i16x8.extmul_high_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
                        (imul (uwiden_high (and (value_type (multi_lane 8 16))
                                                x))
                              (uwiden_high (and (value_type (multi_lane 8 16))
@@ -1051,7 +1051,7 @@
         (x64_pmullw x3 y3)))
 
 ;; Special case for `i32x4.extmul_high_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
                        (imul (uwiden_high (and (value_type (multi_lane 16 8))
                                                x))
                              (uwiden_high (and (value_type (multi_lane 16 8))
@@ -1063,7 +1063,7 @@
         (x64_punpckhwd lo hi)))
 
 ;; Special case for `i64x2.extmul_high_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
                        (imul (uwiden_high (and (value_type (multi_lane 32 4))
                                                x))
                              (uwiden_high (and (value_type (multi_lane 32 4))
@@ -1077,7 +1077,7 @@
         (x64_pmuludq x2 y2)))
 
 ;; Special case for `i16x8.extmul_low_i8x16_u`.
-(rule (lower (has_type (multi_lane 16 8)
+(rule 1 (lower (has_type (multi_lane 16 8)
                        (imul (uwiden_low (and (value_type (multi_lane 8 16))
                                               x))
                              (uwiden_low (and (value_type (multi_lane 8 16))
@@ -1087,7 +1087,7 @@
         (x64_pmullw x2 y2)))
 
 ;; Special case for `i32x4.extmul_low_i16x8_u`.
-(rule (lower (has_type (multi_lane 32 4)
+(rule 1 (lower (has_type (multi_lane 32 4)
                        (imul (uwiden_low (and (value_type (multi_lane 16 8))
                                               x))
                              (uwiden_low (and (value_type (multi_lane 16 8))
@@ -1099,7 +1099,7 @@
         (x64_punpcklwd lo hi)))
 
 ;; Special case for `i64x2.extmul_low_i32x4_u`.
-(rule (lower (has_type (multi_lane 64 2)
+(rule 1 (lower (has_type (multi_lane 64 2)
                        (imul (uwiden_low (and (value_type (multi_lane 32 4))
                                               x))
                              (uwiden_low (and (value_type (multi_lane 32 4))
diff --git a/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif b/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif
new file mode 100644
index 0000000000..daa5ad5ef3
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/simd-widen-mul.clif
@@ -0,0 +1,246 @@
+
+
+test compile precise-output
+set enable_simd
+target x86_64
+
+function %imul_swiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm3
+; palignr $8, %xmm3, %xmm0, %xmm3
+; pmovsxbw %xmm3, %xmm0
+; movdqa %xmm1, %xmm7
+; palignr $8, %xmm7, %xmm1, %xmm7
+; pmovsxbw %xmm7, %xmm9
+; pmullw %xmm0, %xmm9, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpckhwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_high v0
+    v3 = swiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $250, %xmm0, %xmm0
+; pshufd $250, %xmm1, %xmm5
+; pmuldq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_low_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pmovsxbw %xmm0, %xmm0
+; pmovsxbw %xmm1, %xmm5
+; pmullw %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_low_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpcklwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_swiden_low_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = swiden_low v0
+    v3 = swiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $80, %xmm0, %xmm0
+; pshufd $80, %xmm1, %xmm5
+; pmuldq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_hi_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm3
+; palignr $8, %xmm3, %xmm0, %xmm3
+; pmovzxbw %xmm3, %xmm0
+; movdqa %xmm1, %xmm7
+; palignr $8, %xmm7, %xmm1, %xmm7
+; pmovzxbw %xmm7, %xmm9
+; pmullw %xmm0, %xmm9, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_hi_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhuw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpckhwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_hi_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_high v0
+    v3 = uwiden_high v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $250, %xmm0, %xmm0
+; pshufd $250, %xmm1, %xmm5
+; pmuludq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_low_i8x16(i8x16, i8x16) -> i16x8 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pmovzxbw %xmm0, %xmm0
+; pmovzxbw %xmm1, %xmm5
+; pmullw %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_low_i16x8(i16x8, i16x8) -> i32x4 {
+block0(v0: i16x8, v1: i16x8):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movdqa %xmm0, %xmm5
+; pmullw %xmm5, %xmm1, %xmm5
+; movdqa %xmm5, %xmm6
+; movdqa %xmm0, %xmm5
+; pmulhuw %xmm5, %xmm1, %xmm5
+; movdqa %xmm6, %xmm0
+; punpcklwd %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %imul_uwiden_low_i32x4(i32x4, i32x4) -> i64x2 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = uwiden_low v0
+    v3 = uwiden_low v1
+    v4 = imul v2, v3
+    return v4
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pshufd $80, %xmm0, %xmm0
+; pshufd $80, %xmm1, %xmm5
+; pmuludq %xmm0, %xmm5, %xmm0
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
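The filetest added above only pins the exact instruction sequences via `test compile precise-output`. As a hedged sketch (not part of the change itself), a behavioral check for one of these widening multiplies could be written in the same CLIF format using the filetest runtest harness; the `test run` / `; run:` syntax below, the function name, and the lane values are illustrative assumptions rather than anything taken from this diff.

test run
set enable_simd
target x86_64

function %extmul_low_i16x8_s(i8x16, i8x16) -> i16x8 {
block0(v0: i8x16, v1: i8x16):
    ; sign-extend the low eight i8 lanes of each operand to i16, then multiply
    v2 = swiden_low v0
    v3 = swiden_low v1
    v4 = imul v2, v3
    return v4
}
; low lanes [1 2 3 4 5 6 7 8] times [2 2 2 2 2 2 2 2] should yield [2 4 6 8 10 12 14 16]
; run: %extmul_low_i16x8_s([1 2 3 4 5 6 7 8 0 0 0 0 0 0 0 0], [2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0]) == [2 4 6 8 10 12 14 16]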