x64: Add AVX support for some more float-related instructions (#6092)

* x64: Add AVX encodings of `vcvt{ss2sd,sd2ss}`

Additionally, update the instruction helpers to take an `XmmMem`
argument so that loads can be sunk into the instruction (see the
first sketch below).

* x64: Add AVX encoding of `sqrts{s,d}` (second sketch below)

* x64: Add AVX support for `rounds{s,d}` (third sketch below)
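The `XmmMem` change is what enables the load sinking exercised by the
`fpromote_load`/`fdemote_load` tests below. A minimal ISLE-style sketch of
the idea, with hypothetical names (`x64_cvtss2sd` is illustrative here,
not necessarily the exact constructor in the tree):

;; Hypothetical lowering rule: because the helper accepts an `XmmMem`,
;; a single-use load feeding `fpromote` can be merged into the
;; conversion rather than first being loaded into a register.
(rule (lower (has_type $F64 (fpromote x)))
      (x64_cvtss2sd x))

The effect is visible in the disassembly below: `vcvtss2sd (%rdx), %xmm0,
%xmm0` reads its operand straight from memory, with no separate `vmovss`
load emitted for the loaded value.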
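For the scalar square root (and the other scalar ops in this commit) the
VEX form is selected only when the target has AVX, leaving the SSE
encoding as the fallback. A hedged sketch, with `use_avx` and the helper
names assumed rather than taken from the tree:

;; Hypothetical rules: the higher-priority rule picks the VEX encoding
;; when AVX is available; otherwise the existing SSE lowering applies.
(rule 1 (lower (has_type $F32 (sqrt x)))
      (if-let $true (use_avx))
      (x64_vsqrtss x))
(rule 0 (lower (has_type $F32 (sqrt x)))
      (x64_sqrtss x))

Note that the VCode below prints two operands (`vsqrtss %xmm0, %xmm0`)
while the disassembly prints three (`vsqrtss %xmm0, %xmm0, %xmm0`): the
VEX encoding carries an extra source register, which supplies the
untouched upper lanes of the destination.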
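`rounds{s,d}` take the SSE4.1 rounding-control immediate, where bits 1:0
select the mode (0 = nearest-even, 1 = down, 2 = up, 3 = truncate), so
`ceil` uses immediate 2, which is the `$2` seen throughout the tests
below. A sketch under assumed names (`RoundImm`, `x64_roundss`):

;; Hypothetical: each rounding CLIF op maps to a fixed immediate;
;; `ceil` rounds toward positive infinity, encoded as 2.
(rule (lower (has_type $F32 (ceil x)))
      (x64_roundss x (RoundImm.RoundUp)))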
Author: Alex Crichton
Date: 2023-03-29 13:09:49 -05:00
Committed by: GitHub
Parent: afb417920d
Commit: 0b0ac3ff73
8 changed files with 543 additions and 11 deletions

@@ -0,0 +1,104 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

function %f1(f32) -> f32 {
block0(v0: f32):
v1 = ceil v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundss $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundss $2, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f2(f64) -> f64 {
block0(v0: f64):
v1 = ceil v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundsd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundsd $2, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f3(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = ceil v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundps $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundps $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f4(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = ceil v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundpd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundpd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -0,0 +1,130 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

function %fpromote(f32) -> f64 {
block0(v0: f32):
v1 = fpromote.f64 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vcvtss2sd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vcvtss2sd %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fpromote_load(i64, f32) -> f64 {
ss0 = explicit_slot 16
block0(v1: i64, v2: f32):
v3 = stack_addr.i64 ss0
store.f32 v2, v3
v4 = load.f32 v3
v5 = fpromote.f64 v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; vmovss %xmm0, 0(%rdx)
; vcvtss2sd 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; vmovss %xmm0, (%rdx) ; trap: heap_oob
; vcvtss2sd (%rdx), %xmm0, %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote(f64) -> f32 {
block0(v0: f64):
v1 = fdemote.f32 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vcvtsd2ss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vcvtsd2ss %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote_load(i64, f64) -> f32 {
ss0 = explicit_slot 16
block0(v1: i64, v2: f64):
v3 = stack_addr.i64 ss0
store.f64 v2, v3
v4 = load.f64 v3
v5 = fdemote.f32 v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; vmovsd %xmm0, 0(%rdx)
; vcvtsd2ss 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; vmovsd %xmm0, (%rdx) ; trap: heap_oob
; vcvtsd2ss (%rdx), %xmm0, %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -0,0 +1,130 @@
test compile precise-output
set enable_simd
target x86_64

function %fpromote(f32) -> f64 {
block0(v0: f32):
v1 = fpromote.f64 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvtss2sd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; cvtss2sd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fpromote_load(i64, f32) -> f64 {
ss0 = explicit_slot 16
block0(v1: i64, v2: f32):
v3 = stack_addr.i64 ss0
store.f32 v2, v3
v4 = load.f32 v3
v5 = fpromote.f64 v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; movss %xmm0, 0(%rdx)
; cvtss2sd 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; movss %xmm0, (%rdx) ; trap: heap_oob
; cvtss2sd (%rdx), %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote(f64) -> f32 {
block0(v0: f64):
v1 = fdemote.f32 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; cvtsd2ss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; cvtsd2ss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote_load(i64, f64) -> f32 {
ss0 = explicit_slot 16
block0(v1: i64, v2: f64):
v3 = stack_addr.i64 ss0
store.f64 v2, v3
v4 = load.f64 v3
v5 = fdemote.f32 v4
return v5
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; movsd %xmm0, 0(%rdx)
; cvtsd2ss 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; movsd %xmm0, (%rdx) ; trap: heap_oob
; cvtsd2ss (%rdx), %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -0,0 +1,54 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

function %sqrt_f32(f32) -> f32 {
block0(v0: f32):
v1 = sqrt v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vsqrtss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vsqrtss %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %sqrt_f64(f64) -> f64 {
block0(v0: f64):
v1 = sqrt v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vsqrtsd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vsqrtsd %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

@@ -0,0 +1,54 @@
test compile precise-output
set enable_simd
target x86_64

function %sqrt_f32(f32) -> f32 {
block0(v0: f32):
v1 = sqrt v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; sqrtss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; sqrtss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %sqrt_f64(f64) -> f64 {
block0(v0: f64):
v1 = sqrt v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; sqrtsd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; sqrtsd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq