From d6ce632b5ba3ad7f29aedc5baf0ae12b14623cc6 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 13 Mar 2023 17:37:59 -0500 Subject: [PATCH] aarch64: Specialize constant vector shifts (#5976) * aarch64: Specialize constant vector shifts This commit adds special lowering rules for vector-shifts-by-constant-amounts to use dedicated instructions which cuts down on the codegen here quite a bit for constant values. * Fix codegen for 0-shift-rights * Special-case zero left-shifts as well * Remove left-shift special case --- cranelift/codegen/src/isa/aarch64/inst.isle | 14 +- cranelift/codegen/src/isa/aarch64/lower.isle | 44 +- .../codegen/src/isa/aarch64/lower/isle.rs | 4 + .../filetests/isa/aarch64/arithmetic.clif | 12 +- .../isa/aarch64/simd-arithmetic.clif | 392 ++++++++++++++++++ .../isa/aarch64/simd-bitwise-compile.clif | 28 +- .../filetests/runtests/simd-arithmetic.clif | 105 +++++ 7 files changed, 554 insertions(+), 45 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index 8d182a92de..aa7b296078 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -2633,6 +2633,18 @@ (decl ushl (Reg Reg VectorSize) Reg) (rule (ushl x y size) (vec_rrr (VecALUOp.Ushl) x y size)) +;; Helpers for generating `ushl` instructions. +(decl ushl_vec_imm (Reg u8 VectorSize) Reg) +(rule (ushl_vec_imm x amt size) (vec_shift_imm (VecShiftImmOp.Shl) amt x size)) + +;; Helpers for generating `ushr` instructions. +(decl ushr_vec_imm (Reg u8 VectorSize) Reg) +(rule (ushr_vec_imm x amt size) (vec_shift_imm (VecShiftImmOp.Ushr) amt x size)) + +;; Helpers for generating `sshr` instructions. +(decl sshr_vec_imm (Reg u8 VectorSize) Reg) +(rule (sshr_vec_imm x amt size) (vec_shift_imm (VecShiftImmOp.Sshr) amt x size)) + ;; Helpers for generating `rotr` instructions. (decl a64_rotr (Type Reg Reg) Reg) @@ -3321,7 +3333,7 @@ dst)) (rule (fcopy_sign x y ty @ (multi_lane _ _)) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (tmp Reg (vec_shift_imm (VecShiftImmOp.Ushr) (max_shift (lane_type ty)) y (vector_size ty))) + (tmp Reg (ushr_vec_imm y (max_shift (lane_type ty)) (vector_size ty))) (_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst x tmp (vector_size ty) (max_shift (lane_type ty)))))) dst)) diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 2b0d678f14..ba037d7cfd 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -352,10 +352,8 @@ (let ((one Reg (splat_const 1 (VectorSize.Size64x2))) (c Reg (orr_vec x y (VectorSize.Size64x2))) (c Reg (and_vec c one (VectorSize.Size64x2))) - (x Reg (vec_shift_imm (VecShiftImmOp.Ushr) 1 x - (VectorSize.Size64x2))) - (y Reg (vec_shift_imm (VecShiftImmOp.Ushr) 1 y - (VectorSize.Size64x2))) + (x Reg (ushr_vec_imm x 1 (VectorSize.Size64x2))) + (y Reg (ushr_vec_imm y 1 (VectorSize.Size64x2))) (sum Reg (add_vec x y (VectorSize.Size64x2)))) (add_vec c sum (VectorSize.Size64x2)))) @@ -1291,11 +1289,16 @@ (csel (Cond.Ne) lo_lshift maybe_hi))))) ;; Shift for vector types. 
-(rule -2 (lower (has_type (ty_vec128 ty) (ishl x y))) +(rule -3 (lower (has_type (ty_vec128 ty) (ishl x y))) (let ((size VectorSize (vector_size ty)) (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty))) (shift Reg (vec_dup masked_shift_amt size))) (sshl x shift size))) +(rule -2 (lower (has_type (ty_vec128 ty) (ishl x (iconst (u64_from_imm64 n))))) + (ushl_vec_imm x (shift_masked_imm ty n) (vector_size ty))) + +(decl pure shift_masked_imm (Type u64) u8) +(extern constructor shift_masked_imm shift_masked_imm) ;; Helper function to emit a shift operation with the opcode specified and ;; the output type specified. The `Reg` provided is shifted by the `Value` @@ -1351,11 +1354,20 @@ (lower_ushr128 x (value_regs_get y 0))) ;; Vector shifts. -(rule -2 (lower (has_type (ty_vec128 ty) (ushr x y))) +;; +;; Note that for constant shifts a 0-width shift can't be emitted so it's +;; special cased to pass through the input as-is since a 0-shift doesn't modify +;; the input anyway. +(rule -4 (lower (has_type (ty_vec128 ty) (ushr x y))) (let ((size VectorSize (vector_size ty)) (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty))) (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size))) (ushl x shift size))) +(rule -3 (lower (has_type (ty_vec128 ty) (ushr x (iconst (u64_from_imm64 n))))) + (ushr_vec_imm x (shift_masked_imm ty n) (vector_size ty))) +(rule -2 (lower (has_type (ty_vec128 ty) (ushr x (iconst (u64_from_imm64 n))))) + (if-let 0 (shift_masked_imm ty n)) + x) ;; lsr lo_rshift, src_lo, amt ;; lsr hi_rshift, src_hi, amt @@ -1387,7 +1399,7 @@ ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Shift for i8/i16/i32. -(rule -2 (lower (has_type (fits_in_32 ty) (sshr x y))) +(rule -4 (lower (has_type (fits_in_32 ty) (sshr x y))) (do_shift (ALUOp.Asr) ty (put_in_reg_sext32 x) y)) ;; Shift for i64. @@ -1400,12 +1412,20 @@ ;; Vector shifts. ;; -;; Note that right shifts are implemented with a negative left shift. -(rule -1 (lower (has_type (ty_vec128 ty) (sshr x y))) +;; Note that right shifts are implemented with a negative left shift. Also note +;; that for constant shifts a 0-width shift can't be emitted so it's special +;; cased to pass through the input as-is since a 0-shift doesn't modify the +;; input anyway. +(rule -3 (lower (has_type (ty_vec128 ty) (sshr x y))) (let ((size VectorSize (vector_size ty)) (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty))) (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size))) (sshl x shift size))) +(rule -2 (lower (has_type (ty_vec128 ty) (sshr x (iconst (u64_from_imm64 n))))) + (sshr_vec_imm x (shift_masked_imm ty n) (vector_size ty))) +(rule -1 (lower (has_type (ty_vec128 ty) (sshr x (iconst (u64_from_imm64 n))))) + (if-let 0 (shift_masked_imm ty n)) + x) ;; lsr lo_rshift, src_lo, amt ;; asr hi_rshift, src_hi, amt @@ -2452,7 +2472,7 @@ (let ( ;; Replicate the MSB of each of the 16 byte lanes across ;; the whole lane (sshr is an arithmetic right shift). - (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 7 vec (VectorSize.Size8x16))) + (shifted Reg (sshr_vec_imm vec 7 (VectorSize.Size8x16))) ;; Bitwise-and with a mask ;; `0x80402010_08040201_80402010_08040201` to get the bit ;; in the proper location for each group of 8 lanes. @@ -2476,7 +2496,7 @@ (let ( ;; Replicate the MSB of each of the 8 16-bit lanes across ;; the whole lane (sshr is an arithmetic right shift). 
- (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 15 vec (VectorSize.Size16x8))) + (shifted Reg (sshr_vec_imm vec 15 (VectorSize.Size16x8))) ;; Bitwise-and with a mask ;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the ;; bit in the proper location for each group of 4 lanes. @@ -2489,7 +2509,7 @@ (let ( ;; Replicate the MSB of each of the 4 32-bit lanes across ;; the whole lane (sshr is an arithmetic right shift). - (shifted Reg (vec_shift_imm (VecShiftImmOp.Sshr) 31 vec (VectorSize.Size32x4))) + (shifted Reg (sshr_vec_imm vec 31 (VectorSize.Size32x4))) ;; Bitwise-and with a mask ;; `0x00000008_00000004_00000002_00000001` to get the bit ;; in the proper location for each group of 4 lanes. diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index d4545565d8..883cb41a3f 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -806,4 +806,8 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> { None } } + + fn shift_masked_imm(&mut self, ty: Type, imm: u64) -> u8 { + (imm as u8) & ((ty.lane_bits() - 1) as u8) + } } diff --git a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif index 4f8669b161..7084651da7 100644 --- a/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif +++ b/cranelift/filetests/filetests/isa/aarch64/arithmetic.clif @@ -561,20 +561,12 @@ block0(v0: i8x16): ; VCode: ; block0: -; movz x2, #1 -; and w4, w2, #7 -; sub x6, xzr, x4 -; dup v16.16b, w6 -; ushl v0.16b, v0.16b, v16.16b +; ushr v0.16b, v0.16b, #1 ; ret ; ; Disassembled: ; block0: ; offset 0x0 -; mov x2, #1 -; and w4, w2, #7 -; neg x6, x4 -; dup v16.16b, w6 -; ushl v0.16b, v0.16b, v16.16b +; ushr v0.16b, v0.16b, #1 ; ret function %add_i128(i128, i128) -> i128 { diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif b/cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif index 5a05e8925b..eab07f7375 100644 --- a/cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif +++ b/cranelift/filetests/filetests/isa/aarch64/simd-arithmetic.clif @@ -128,3 +128,395 @@ block0(v0: i64x2, v1: i64x2): ; add v0.2d, v17.2d, v23.2d ; ret +function %ishl_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 1 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.16b, v0.16b, #1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.16b, v0.16b, #1 +; ret + +function %ishl_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 15 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.8h, v0.8h, #15 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.8h, v0.8h, #0xf +; ret + +function %ishl_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 22 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.4s, v0.4s, #22 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.4s, v0.4s, #0x16 +; ret + +function %ishl_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 55 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.2d, v0.2d, #55 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.2d, v0.2d, #0x37 +; ret + +function %sshr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 1 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; block0: +; sshr v0.16b, v0.16b, #1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; sshr v0.16b, v0.16b, #1 +; ret + +function 
%sshr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 15 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; block0: +; sshr v0.8h, v0.8h, #15 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; sshr v0.8h, v0.8h, #0xf +; ret + +function %sshr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 22 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; block0: +; sshr v0.4s, v0.4s, #22 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; sshr v0.4s, v0.4s, #0x16 +; ret + +function %sshr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 55 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; block0: +; sshr v0.2d, v0.2d, #55 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; sshr v0.2d, v0.2d, #0x37 +; ret + +function %ushr_i8x16_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 1 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ushr v0.16b, v0.16b, #1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ushr v0.16b, v0.16b, #1 +; ret + +function %ushr_i16x8_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 15 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ushr v0.8h, v0.8h, #15 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ushr v0.8h, v0.8h, #0xf +; ret + +function %ushr_i32x4_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 22 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ushr v0.4s, v0.4s, #22 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ushr v0.4s, v0.4s, #0x16 +; ret + +function %ushr_i64x2_imm(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 55 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ushr v0.2d, v0.2d, #55 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ushr v0.2d, v0.2d, #0x37 +; ret + +function %ishl_i8x16_full_width(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 8 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.16b, v0.16b, #0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.16b, v0.16b, #0 +; ret + +function %ishl_i16x8_full_width(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 16 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.8h, v0.8h, #0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.8h, v0.8h, #0 +; ret + +function %ishl_i32x4_full_width(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 32 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.4s, v0.4s, #0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.4s, v0.4s, #0 +; ret + +function %ishl_i64x2_full_width(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 64 + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; block0: +; shl v0.2d, v0.2d, #0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; shl v0.2d, v0.2d, #0 +; ret + +function %sshr_i8x16_full_width(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 8 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + +function %sshr_i16x8_full_width(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 16 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + +function %sshr_i32x4_full_width(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 32 + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + +function %sshr_i64x2_full_width(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 64 + v2 = sshr v0, v1 + 
return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + +function %ushr_i8x16_full_width(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 8 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + +function %ushr_i16x8_full_width(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 16 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + +function %ushr_i32x4_full_width(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 32 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + +function %ushr_i64x2_full_width(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i32 64 + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; block0: +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ret + diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif index 9b7b851128..74b80897fa 100644 --- a/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/aarch64/simd-bitwise-compile.clif @@ -259,22 +259,14 @@ block0: ; VCode: ; block0: -; ldr q5, [const(0)] -; movz w1, #1 -; and w3, w1, #7 -; sub x5, xzr, x3 -; dup v7.16b, w5 -; ushl v0.16b, v5.16b, v7.16b +; ldr q1, [const(0)] +; ushr v0.16b, v1.16b, #1 ; ret ; ; Disassembled: ; block0: ; offset 0x0 -; ldr q5, #0x20 -; mov w1, #1 -; and w3, w1, #7 -; neg x5, x3 -; dup v7.16b, w5 -; ushl v0.16b, v5.16b, v7.16b +; ldr q1, #0x10 +; ushr v0.16b, v1.16b, #1 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x01, 0x02, 0x03 @@ -321,20 +313,12 @@ block0(v0: i8x16, v1: i32): ; VCode: ; block0: -; movz w3, #3 -; and w5, w3, #7 -; sub x7, xzr, x5 -; dup v17.16b, w7 -; sshl v0.16b, v0.16b, v17.16b +; sshr v0.16b, v0.16b, #3 ; ret ; ; Disassembled: ; block0: ; offset 0x0 -; mov w3, #3 -; and w5, w3, #7 -; neg x7, x5 -; dup v17.16b, w7 -; sshl v0.16b, v0.16b, v17.16b +; sshr v0.16b, v0.16b, #3 ; ret function %sshr_i64x2(i64x2, i32) -> i64x2 { diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif index ec1e4ad018..171e5245e7 100644 --- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif @@ -195,3 +195,108 @@ block0(v0: i32x4): return v1 } ; run: %iabs([-42 -1 0 1]) == [42 1 0 1] + +function %i8x16_shl_imm(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i32 2 + v2 = ishl v0, v1 + return v2 +} +; run: %i8x16_shl_imm([0x01 0x02 0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0]) == [0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0 0 0] + +function %i16x8_shl_imm(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i32 4 + v2 = ishl v0, v1 + return v2 +} +; run: %i16x8_shl_imm([0x0001 0x0002 0x0004 0x0008 0x0010 0x0020 0x0040 0x0080]) == [0x0010 0x0020 0x0040 0x0080 0x0100 0x0200 0x0400 0x0800] +; run: %i16x8_shl_imm([0x0100 0x0200 0x0400 0x0800 0x1000 0x2000 0x4000 0x8000]) == [0x1000 0x2000 0x4000 0x8000 0 0 0 0] + +function %i32x4_shl_imm(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 4 + v2 = ishl v0, v1 + return v2 +} +; run: %i32x4_shl_imm([0x00000001 0x00000002 0x00000004 0x00000008]) == [0x00000010 0x00000020 0x00000040 0x00000080] +; run: %i32x4_shl_imm([0x10000000 0x00010000 0xf0000000 0x02000000]) == [0 0x00100000 0 0x20000000] 
+
+function %i64x2_shl_imm(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i32 32
+    v2 = ishl v0, v1
+    return v2
+}
+; run: %i64x2_shl_imm([0x1 0xf]) == [0x100000000 0xf00000000]
+; run: %i64x2_shl_imm([0x100000000 0]) == [0 0]
+
+function %i8x16_sshr_imm(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i32 2
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %i8x16_sshr_imm([0x01 0x02 0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0]) == [0 0 0x01 0x02 0x04 0x08 0x10 0xe0 0 0 0 0 0 0 0 0]
+
+function %i16x8_sshr_imm(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i32 4
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %i16x8_sshr_imm([0x0001 0x0002 0x0004 0x0008 0x0010 0x0020 0x0040 0x0080]) == [0 0 0 0 0x1 0x2 0x4 0x8]
+; run: %i16x8_sshr_imm([-1 -2 -4 -8 -16 16 0x8000 0x80f3]) == [-1 -1 -1 -1 -1 1 0xf800 0xf80f]
+
+function %i32x4_sshr_imm(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 4
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %i32x4_sshr_imm([1 0xfc 0x80000000 0xf83f3000]) == [0 0xf 0xf8000000 0xff83f300]
+
+function %i64x2_sshr_imm(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i32 32
+    v2 = sshr v0, v1
+    return v2
+}
+; run: %i64x2_sshr_imm([0x1 0xf]) == [0 0]
+; run: %i64x2_sshr_imm([0x100000000 0]) == [1 0]
+; run: %i64x2_sshr_imm([-1 -1]) == [-1 -1]
+
+function %i8x16_ushr_imm(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i32 2
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %i8x16_ushr_imm([0x01 0x02 0x04 0x08 0x10 0x20 0x40 0x80 0 0 0 0 0 0 0 0]) == [0 0 0x01 0x02 0x04 0x08 0x10 0x20 0 0 0 0 0 0 0 0]
+
+function %i16x8_ushr_imm(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i32 4
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %i16x8_ushr_imm([0x0001 0x0002 0x0004 0x0008 0x0010 0x0020 0x0040 0x0080]) == [0 0 0 0 0x1 0x2 0x4 0x8]
+; run: %i16x8_ushr_imm([-1 -2 -4 -8 -16 16 0x8000 0x80f3]) == [0x0fff 0x0fff 0x0fff 0x0fff 0x0fff 1 0x0800 0x080f]
+
+function %i32x4_ushr_imm(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 4
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %i32x4_ushr_imm([1 0xfc 0x80000000 0xf83f3000]) == [0 0xf 0x08000000 0x0f83f300]
+
+function %i64x2_ushr_imm(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i32 32
+    v2 = ushr v0, v1
+    return v2
+}
+; run: %i64x2_ushr_imm([0x1 0xf]) == [0 0]
+; run: %i64x2_ushr_imm([0x100000000 0]) == [1 0]
+; run: %i64x2_ushr_imm([-1 -1]) == [0xffffffff 0xffffffff]
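
The decision the new ISLE rules encode can be modeled in a few lines of Rust. The following is a minimal standalone sketch, not part of the patch and not a Cranelift API; the enum and function names are illustrative. It assumes only what the diff shows: the constant amount is masked by `lane_bits - 1` (as `shift_masked_imm` does), a masked amount of 0 for `ushr`/`sshr` passes the input through unchanged because the USHR/SSHR immediate encodings cannot express a zero shift, and every other case becomes a single shift-by-immediate instruction.

```rust
// Standalone model of the constant vector-shift lowering decision; names are
// illustrative only and are not Cranelift APIs.

/// What the lowering emits for a vector shift by a constant amount.
#[derive(Debug, PartialEq)]
enum Lowering {
    /// A single SHL/USHR/SSHR instruction with an immediate shift amount.
    ShiftImm(u8),
    /// No instruction at all: the masked amount is 0 and the right-shift
    /// immediate encodings cannot express a zero shift, so the input is
    /// returned unchanged.
    PassThrough,
}

/// Mirrors `shift_masked_imm`: shift amounts are taken modulo the lane width,
/// matching Cranelift's shift semantics.
fn shift_masked_imm(lane_bits: u32, imm: u64) -> u8 {
    (imm as u8) & ((lane_bits - 1) as u8)
}

/// `is_right_shift` distinguishes `ushr`/`sshr` (immediate range 1..=lane_bits)
/// from `ishl`, whose SHL immediate can encode a shift of 0.
fn lower_const_vector_shift(lane_bits: u32, imm: u64, is_right_shift: bool) -> Lowering {
    match shift_masked_imm(lane_bits, imm) {
        0 if is_right_shift => Lowering::PassThrough,
        amt => Lowering::ShiftImm(amt),
    }
}

fn main() {
    // `ushr v0.16b, v0.16b, #1`, as in the arithmetic.clif test above.
    assert_eq!(lower_const_vector_shift(8, 1, true), Lowering::ShiftImm(1));
    // Full-width right shifts mask to 0 and become a no-op (just `ret`).
    assert_eq!(lower_const_vector_shift(64, 64, true), Lowering::PassThrough);
    // Full-width left shifts also mask to 0, but `shl ..., #0` is encodable.
    assert_eq!(lower_const_vector_shift(8, 8, false), Lowering::ShiftImm(0));
}
```

This is why the full-width right-shift filetests above lower to a bare `ret`, while the full-width left shifts still emit `shl ..., #0`.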