test compile
set enable_simd
target x86_64 machinst has_ssse3 has_sse41

;; shuffle

function %shuffle_different_ssa_values() -> i8x16 {
block0:
    v0 = vconst.i8x16 0x00
    v1 = vconst.i8x16 0x01
    v2 = shuffle v0, v1, 0x11000000000000000000000000000000 ; pick the second lane of v1, the rest use the first lane of v0
    return v2
}
;; With two distinct inputs, each operand is shuffled by its own PSHUFB mask (out-of-range
;; indices zero a lane) and the two partial results are OR'd together.
; check: load_const VCodeConstant(3), %xmm1
; nextln: load_const VCodeConstant(2), %xmm0
; nextln: load_const VCodeConstant(0), %xmm2
; nextln: pshufb %xmm2, %xmm1
; nextln: load_const VCodeConstant(1), %xmm2
; nextln: pshufb %xmm2, %xmm0
; nextln: orps %xmm1, %xmm0

function %shuffle_same_ssa_value() -> i8x16 {
block0:
    v1 = vconst.i8x16 0x01
    v2 = shuffle v1, v1, 0x13000000000000000000000000000000 ; pick the fourth lane of v1 and the rest from the first lane of v1
    return v2
}
;; When both shuffle operands are the same SSA value, a single PSHUFB suffices.
; check: load_const VCodeConstant(1), %xmm0
; nextln: load_const VCodeConstant(0), %xmm1
; nextln: pshufb %xmm1, %xmm0

;; swizzle

function %swizzle() -> i8x16 {
block0:
    v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
    v2 = swizzle.i8x16 v0, v1
    return v2
}
;; PADDUSB saturating-adds a constant to the index vector so that any index >= 16 ends up
;; with its top bit set; PSHUFB then writes zero for those lanes, matching CLIF swizzle
;; semantics for out-of-range indices.
; check: load_const VCodeConstant(1), %xmm1
; nextln: load_const VCodeConstant(1), %xmm0
; nextln: load_const VCodeConstant(0), %xmm2
; nextln: paddusb %xmm2, %xmm0
; nextln: pshufb %xmm0, %xmm1
; nextln: movdqa %xmm1, %xmm0

;; splat

function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
    v1 = splat.i8x16 v0
    return v1
}
;; Insert the scalar into lane 0, then broadcast it with a PSHUFB whose mask is all zeros.
; check: uninit %xmm0
; nextln: pinsrb $$0, %rdi, %xmm0
; nextln: pxor %xmm1, %xmm1
; nextln: pshufb %xmm1, %xmm0

function %splat_b16() -> b16x8 {
block0:
    v0 = bconst.b16 true
    v1 = splat.b16x8 v0
    return v1
}
;; The two PINSRW instructions place two copies of the scalar in the low 32 bits; PSHUFD
;; with immediate 0 then broadcasts that 32-bit group to all four positions.
; check: uninit %xmm0
; nextln: pinsrw $$0, %rsi, %xmm0
; nextln: pinsrw $$1, %rsi, %xmm0
; nextln: pshufd $$0, %xmm0, %xmm0

function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
    v1 = splat.i32x4 v0
    return v1
}
; check: uninit %xmm0
; nextln: pinsrd $$0, %rdi, %xmm0
; nextln: pshufd $$0, %xmm0, %xmm0

function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
    v1 = splat.f64x2 v0
    return v1
}
;; MOVSD copies the scalar into the low lane; MOVLHPS duplicates the low 64 bits into the
;; high 64 bits.
; check: uninit %xmm1
; nextln: movsd %xmm0, %xmm1
; nextln: movlhps %xmm0, %xmm1

;; load*_zero

;; Verify that a `load` followed by a `scalar_to_vector` (the CLIF translation of
;; `load32_zero`) is lowered to a single MOVSS instruction.
function %load32_zero_coalesced(i64) -> i32x4 {
block0(v0: i64):
    v1 = load.i32 v0
    v2 = scalar_to_vector.i32x4 v1
    ; check: movss 0(%rdi), %xmm0
    return v2
}

;; Verify that `scalar_to_vector` (used by `load32_zero`) lowers as expected.
function %load32_zero_int(i32) -> i32x4 {
block0(v0: i32):
    v1 = scalar_to_vector.i32x4 v0
    ; check: movd %edi, %xmm0
    return v1
}

;; Verify that `scalar_to_vector` of an F32 that is already in an XMM register emits no
;; move: no MOVAPS/MOVAPD may appear between the prologue and the return.
function %load32_zero_float(f32) -> f32x4 {
block0(v0: f32):
    v1 = scalar_to_vector.f32x4 v0
    ; regex: MOV=movap*
    ; check: pushq
    ; not: $MOV
    ; check: ret
    return v1
}