x64: Add most remaining AVX lowerings (#5819)

* x64: Add most remaining AVX lowerings

This commit goes through `inst.isle` and adds a corresponding AVX
lowering for most SSE lowerings. I opted to skip instructions whose SSE
lowering doesn't read/modify its destination register, such as
`roundps`. I think AVX will still benefit those instructions via
load-merging, since AVX doesn't require aligned memory operands, but
I've deferred that work to a future PR.
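
As a hand-written illustration (not output generated by this PR) of why
unaligned load-merging helps: SSE packed instructions require 16-byte
alignment for memory operands, so a possibly-unaligned load has to stay
a separate instruction, whereas the VEX encoding tolerates unaligned
memory operands and the load can fold into the arithmetic op:

    # SSE: separate unaligned load, then the arithmetic op
    movups 0x10(%rdi), %xmm2
    addps  %xmm2, %xmm0            # xmm0 += xmm2

    # AVX: the unaligned load folds directly into the instruction
    vaddps 0x10(%rdi), %xmm0, %xmm0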

Otherwise, I think this PR covers all (or almost all) of the 3-operand
AVX forms of instructions that have SSE counterparts. This should
improve codegen slightly by reducing register pressure and removing the
need for `movdqa` copies between registers. I've attempted to ensure
that there's at least one codegen test for each of the new instructions.
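
As a sketch of the difference (again hand-written for illustration, not
output from this PR): SSE's destructive two-operand form overwrites one
of its sources, so keeping that value live requires an extra copy, while
the AVX three-operand form writes a separate destination:

    # SSE: destination is also a source, so preserve xmm0 with a copy
    movdqa %xmm0, %xmm2
    paddd  %xmm1, %xmm2            # xmm2 = xmm0 + xmm1

    # AVX: non-destructive three-operand form, no movdqa required
    vpaddd %xmm1, %xmm0, %xmm2     # xmm2 = xmm0 + xmm1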

As a side note, the recent capstone integration into `precise-output`
tests helped me catch a number of encoding bugs much earlier than I
otherwise would have, so I've found it incredibly useful in tests!

* Move `vpinsr*` instructions to their own variant

Use true `XmmMem` and `GprMem` types in the instruction as well to get
more type-level safety for what goes where.

* Remove `Inst::produces_const` accessor

Instead of conditionally defining regalloc and various other operations,
add dedicated `MInst` variants for operations which are intended to
produce a constant, making their interactions with regalloc, printing,
and so on clearer.

* Fix tests

* Register traps in `MachBuffer` for load-folding ops

This adds a missing `add_trap` to the encoding of VEX instructions with
memory operands, ensuring that if they cause a segfault there's
appropriate metadata for Wasmtime to understand that the instruction
could in fact trap. This fixes a fuzz test case found locally where v8
trapped but Wasmtime didn't catch the signal and crashed the fuzzer.
Author: Alex Crichton
Date: 2023-02-20 09:11:52 -06:00
Committed by: GitHub
Parent: ad128b6811
Commit: c26a65a854
16 changed files with 4145 additions and 466 deletions

@@ -1032,12 +1032,12 @@ block0(v0: f32x4):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pxor %xmm2, %xmm2, %xmm2
; xorps %xmm5, %xmm5, %xmm5
; movdqa %xmm0, %xmm9
; maxps %xmm9, %xmm2, %xmm9
; pcmpeqd %xmm7, %xmm7, %xmm7
; psrld %xmm7, $1, %xmm7
; cvtdq2ps %xmm7, %xmm13
; maxps %xmm9, %xmm5, %xmm9
; pcmpeqd %xmm5, %xmm5, %xmm5
; psrld %xmm5, $1, %xmm5
; cvtdq2ps %xmm5, %xmm13
; cvttps2dq %xmm9, %xmm12
; subps %xmm9, %xmm13, %xmm9
; cmpps $2, %xmm13, %xmm9, %xmm13
@@ -1055,12 +1055,12 @@ block0(v0: f32x4):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pxor %xmm2, %xmm2
; xorps %xmm5, %xmm5
; movdqa %xmm0, %xmm9
; maxps %xmm2, %xmm9
; pcmpeqd %xmm7, %xmm7
; psrld $1, %xmm7
; cvtdq2ps %xmm7, %xmm13
; maxps %xmm5, %xmm9
; pcmpeqd %xmm5, %xmm5
; psrld $1, %xmm5
; cvtdq2ps %xmm5, %xmm13
; cvttps2dq %xmm9, %xmm12
; subps %xmm13, %xmm9
; cmpleps %xmm9, %xmm13

(File diff suppressed because it is too large.)

@@ -2,43 +2,6 @@ test compile precise-output
set enable_simd
target x86_64 has_avx
function %mask_from_icmp(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fmin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vminps %xmm0, %xmm1, %xmm3
; vminps %xmm1, %xmm0, %xmm5
; vorps %xmm3, %xmm5, %xmm7
; vcmpps $3 %xmm7, %xmm5, %xmm9
; vorps %xmm7, %xmm9, %xmm11
; vpsrld %xmm9, $10, %xmm13
; vandnps %xmm13, %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vminps %xmm1, %xmm0, %xmm3
; vminps %xmm0, %xmm1, %xmm5
; vorps %xmm5, %xmm3, %xmm7
; vcmpunordps %xmm5, %xmm7, %xmm9
; vorps %xmm9, %xmm7, %xmm11
; vpsrld $0xa, %xmm9, %xmm13
; vandnps %xmm11, %xmm13, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %or_from_memory(f32x4, i64) -> f32x4 {
block0(v0: f32x4, v1: i64):
v2 = load.f32x4 notrap aligned v1
@@ -81,10 +44,10 @@ block0(v0: i64):
; block0:
; movss 0(%rdi), %xmm7
; movl $-2147483648, %ecx
; movd %ecx, %xmm8
; vandnps %xmm8, const(0), %xmm9
; andps %xmm8, %xmm7, %xmm8
; vorps %xmm9, %xmm8, %xmm0
; movd %ecx, %xmm5
; vandnps %xmm5, const(0), %xmm8
; vandps %xmm5, %xmm7, %xmm9
; vorps %xmm8, %xmm9, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -96,10 +59,10 @@ block0(v0: i64):
; block1: ; offset 0x4
; movss (%rdi), %xmm7
; movl $0x80000000, %ecx
; movd %ecx, %xmm8
; vandnps 0x16(%rip), %xmm8, %xmm9
; andps %xmm7, %xmm8
; vorps %xmm8, %xmm9, %xmm0
; movd %ecx, %xmm5
; vandnps 0x17(%rip), %xmm5, %xmm8
; vandps %xmm7, %xmm5, %xmm9
; vorps %xmm9, %xmm8, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -166,9 +129,9 @@ block0(v0: f32x4, v1: f32x4):
; popq %rbp
; retq
function %i32x4_shr(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = ushr v0, v1
function %band_not_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = band_not v0, v1
return v2
}
@@ -176,10 +139,7 @@ block0(v0: i32x4, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rcx
; andq %rcx, $31, %rcx
; movd %ecx, %xmm5
; vpsrld %xmm0, %xmm5, %xmm0
; vandnpd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -189,10 +149,32 @@ block0(v0: i32x4, v1: i32):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %rcx
; andq $0x1f, %rcx
; movd %ecx, %xmm5
; vpsrld %xmm5, %xmm0, %xmm0
; vandnpd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %band_not_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = band_not v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpandn %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpandn %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
@@ -209,7 +191,7 @@ block0(v0: f32x4):
; block0:
; pcmpeqd %xmm2, %xmm2, %xmm2
; vpsrld %xmm2, $1, %xmm4
; andps %xmm0, %xmm4, %xmm0
; vandps %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -221,7 +203,457 @@ block0(v0: f32x4):
; block1: ; offset 0x4
; pcmpeqd %xmm2, %xmm2
; vpsrld $1, %xmm2, %xmm4
; andps %xmm4, %xmm0
; vandps %xmm4, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_and(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = band v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpand %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpand %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f32x4_and(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = band v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vandps %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vandps %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f64x2_and(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = band v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vandpd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vandpd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_or(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bor v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpor %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpor %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f32x4_or(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = bor v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vorps %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vorps %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f64x2_or(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = bor v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vorpd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vorpd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_xor(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = bxor v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpxor %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpxor %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f32x4_xor(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = bxor v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vxorps %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vxorps %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f64x2_xor(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = bxor v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vxorpd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vxorpd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_bitselect(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
return v3
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpblendvb %xmm0, %xmm1, %xmm0, %xmm2
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i32x4_bitselect(i32x4, f32x4, f32x4) -> f32x4 {
block0(v0: i32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2
return v3
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vblendvps %xmm0, %xmm1, %xmm0, %xmm2
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i64x2_bitselect(i64x2, f64x2, f64x2) -> f64x2 {
block0(v0: i64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2
return v3
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f32x4_replace_lane(f32x4, f32) -> f32x4 {
block0(v0: f32x4, v1: f32):
v2 = insertlane v0, v1, 1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vinsertps $16 %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vinsertps $0x10, %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f64x2_replace_lane(f64x2, f64) -> f64x2 {
block0(v0: f64x2, v1: f64):
v2 = insertlane v0, v1, 1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmovlhps %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmovlhps %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i8x16_replace_lane(i8x16, i8) -> i8x16 {
block0(v0: i8x16, v1: i8):
v2 = insertlane v0, v1, 1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpinsrb $1 %xmm0, %rdi, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpinsrb $1, %edi, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_replace_lane(i16x8, i16) -> i16x8 {
block0(v0: i16x8, v1: i16):
v2 = insertlane v0, v1, 1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpinsrw $1 %xmm0, %rdi, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpinsrw $1, %edi, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i32x4_replace_lane(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = insertlane v0, v1, 1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpinsrd $1 %xmm0, %rdi, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpinsrd $1, %edi, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i64x2_replace_lane(i64x2, i64) -> i64x2 {
block0(v0: i64x2, v1: i64):
v2 = insertlane v0, v1, 1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpinsrq $1 %xmm0, %rdi, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpinsrq $1, %rdi, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -0,0 +1,656 @@
test compile precise-output
set enable_simd
target x86_64 has_avx
function %i8x16_eq(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = icmp eq v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpeqb %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpeqb %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_eq(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = icmp eq v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpeqw %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpeqw %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i32x4_eq(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp eq v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpeqd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpeqd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i64x2_eq(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = icmp eq v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpeqq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpeqq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i8x16_gt(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = icmp sgt v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpgtb %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpgtb %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_gt(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = icmp sgt v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpgtw %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpgtw %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i32x4_gt(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp sgt v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpgtd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpgtd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i64x2_gt(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = icmp sgt v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpcmpgtq %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpcmpgtq %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f32x4_min(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fmin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vminps %xmm0, %xmm1, %xmm3
; vminps %xmm1, %xmm0, %xmm5
; vorps %xmm3, %xmm5, %xmm7
; vcmpps $3 %xmm7, %xmm5, %xmm9
; vorps %xmm7, %xmm9, %xmm11
; vpsrld %xmm9, $10, %xmm13
; vandnps %xmm13, %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vminps %xmm1, %xmm0, %xmm3
; vminps %xmm0, %xmm1, %xmm5
; vorps %xmm5, %xmm3, %xmm7
; vcmpunordps %xmm5, %xmm7, %xmm9
; vorps %xmm9, %xmm7, %xmm11
; vpsrld $0xa, %xmm9, %xmm13
; vandnps %xmm11, %xmm13, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f64x2_min(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fmin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vminpd %xmm0, %xmm1, %xmm3
; vminpd %xmm1, %xmm0, %xmm5
; vorpd %xmm3, %xmm5, %xmm7
; vcmppd $3 %xmm3, %xmm5, %xmm9
; vorpd %xmm7, %xmm9, %xmm11
; vpsrlq %xmm9, $13, %xmm13
; vandnpd %xmm13, %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vminpd %xmm1, %xmm0, %xmm3
; vminpd %xmm0, %xmm1, %xmm5
; vorpd %xmm5, %xmm3, %xmm7
; vcmpunordpd %xmm5, %xmm3, %xmm9
; vorpd %xmm9, %xmm7, %xmm11
; vpsrlq $0xd, %xmm9, %xmm13
; vandnpd %xmm11, %xmm13, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f32x4_max(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fmax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmaxps %xmm0, %xmm1, %xmm3
; vmaxps %xmm1, %xmm0, %xmm5
; vxorps %xmm3, %xmm5, %xmm7
; vorps %xmm3, %xmm7, %xmm9
; vsubps %xmm9, %xmm7, %xmm11
; vcmpps $3 %xmm9, %xmm9, %xmm13
; vpsrld %xmm13, $10, %xmm15
; vandnps %xmm15, %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmaxps %xmm1, %xmm0, %xmm3
; vmaxps %xmm0, %xmm1, %xmm5
; vxorps %xmm5, %xmm3, %xmm7
; vorps %xmm7, %xmm3, %xmm9
; vsubps %xmm7, %xmm9, %xmm11
; vcmpunordps %xmm9, %xmm9, %xmm13
; vpsrld $0xa, %xmm13, %xmm15
; vandnps %xmm11, %xmm15, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %f64x2_max(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fmax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vmaxpd %xmm0, %xmm1, %xmm3
; vmaxpd %xmm1, %xmm0, %xmm5
; vxorpd %xmm3, %xmm5, %xmm7
; vorpd %xmm3, %xmm7, %xmm9
; vsubpd %xmm9, %xmm7, %xmm11
; vcmppd $3 %xmm9, %xmm9, %xmm13
; vpsrlq %xmm13, $13, %xmm15
; vandnpd %xmm15, %xmm11, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vmaxpd %xmm1, %xmm0, %xmm3
; vmaxpd %xmm0, %xmm1, %xmm5
; vxorpd %xmm5, %xmm3, %xmm7
; vorpd %xmm7, %xmm3, %xmm9
; vsubpd %xmm7, %xmm9, %xmm11
; vcmpunordpd %xmm9, %xmm9, %xmm13
; vpsrlq $0xd, %xmm13, %xmm15
; vandnpd %xmm11, %xmm15, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i8x16_min(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = smin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpminsb %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpminsb %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %u8x16_min(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = umin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpminub %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpminub %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_min(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = smin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpminsw %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpminsw %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %u16x8_min(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = umin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpminuw %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpminuw %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i32x4_min(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = smin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpminsd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpminsd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %u32x4_min(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = umin v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpminud %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpminud %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i8x16_max(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = smax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpmaxsb %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpmaxsb %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %u8x16_max(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = umax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpmaxub %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpmaxub %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i16x8_max(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = smax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpmaxsw %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpmaxsw %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %u16x8_max(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = umax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpmaxuw %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpmaxuw %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i32x4_max(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = smax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpmaxsd %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpmaxsd %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %u32x4_max(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = umax v0, v1
return v2
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpmaxud %xmm0, %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpmaxud %xmm1, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -1,6 +1,6 @@
test compile precise-output
set enable_simd
target x86_64 skylake
target x86_64
function %icmp_ne_32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):

@@ -1,6 +1,6 @@
test compile precise-output
set enable_simd
target x86_64 skylake
target x86_64
function %bnot_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):

@@ -3,6 +3,7 @@ target aarch64
target s390x
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
;; shuffle