x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest (#6224)

* x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest

This commit adds lowerings that work with SSE2 for the CLIF `ceil`,
`trunc`, `floor`, and `nearest` instructions over vectors. To get these
working, `insertlane` for float vectors was also implemented without
SSE4.1 instructions.

Note that the goal of these lowerings is not speed but rather "it
works", so the decompose-into-scalar-libcalls logic for vectors is
probably horrendously slow but should at least be correct.
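
For reference, the per-lane decomposition behaves like the following
plain-Rust sketch (illustrative only: the real lowering calls
`LibCall.CeilF32` etc., not the Rust standard library):

```rust
// Shape of the non-SSE4.1 vector fallback: one scalar libcall per lane,
// plus the shuffles needed to move each lane in and out of position.
fn ceil_f32x4_fallback(v: [f32; 4]) -> [f32; 4] {
    let mut out = v;
    for lane in out.iter_mut() {
        // `f32::ceil` stands in for the `CeilF32` libcall.
        *lane = lane.ceil();
    }
    out
}
```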

* Skip new tests on riscv64

* Update cranelift/codegen/src/isa/x64/inst.isle

Co-authored-by: Andrew Brown <andrew.brown@intel.com>

---------

Co-authored-by: Andrew Brown <andrew.brown@intel.com>
Alex Crichton
2023-04-18 12:23:18 -05:00
committed by GitHub
parent 299131ae2d
commit 62f8928bee
6 changed files with 180 additions and 96 deletions

cranelift/codegen/src/isa/x64/inst.isle

@@ -3119,9 +3119,9 @@
       (if-let $true (use_avx_simd))
       (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm))

-;; Helper for creating a `movsd` instruction which creates a new vector
-;; register where the upper 64-bits are from the first operand and the low
-;; 64-bits are from the second operand.
+;; Helper for creating `movsd`/`movss` instructions which create a new vector
+;; register where the upper bits are from the first operand and the low
+;; bits are from the second operand.
 ;;
 ;; Note that the second argument here is specifically `Xmm` instead of `XmmMem`
 ;; because there is no encoding of a 3-operand form of `movsd` and otherwise
@@ -3134,6 +3134,13 @@
       (if-let $true (use_avx_simd))
       (xmm_rmir_vex (AvxOpcode.Vmovsd) src1 src2))

+(decl x64_movss_regmove (Xmm Xmm) Xmm)
+(rule (x64_movss_regmove src1 src2)
+      (xmm_rm_r_unaligned (SseOpcode.Movss) src1 src2))
+(rule 1 (x64_movss_regmove src1 src2)
+      (if-let $true (use_avx_simd))
+      (xmm_rmir_vex (AvxOpcode.Vmovss) src1 src2))
+
 ;; Helper for creating `movlhps` instructions.
 (decl x64_movlhps (Xmm XmmMem) Xmm)
 (rule 0 (x64_movlhps src1 src2)
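
A minimal model of the register-move semantics the comments above describe,
viewing an XMM register as an array of lanes (Rust, illustrative only):

```rust
// `movsd xmm1, xmm2` (register form): the low 64 bits (lane 0 of an f64x2)
// come from the second operand; the upper 64 bits of the first are kept.
fn movsd_regmove(src1: [u64; 2], src2: [u64; 2]) -> [u64; 2] {
    [src2[0], src1[1]]
}

// `movss xmm1, xmm2` (register form): only the low 32 bits (lane 0 of an
// f32x4) come from the second operand; lanes 1..=3 of the first are kept.
// This is what makes `x64_movss_regmove` usable for `f32x4.replace_lane 0`.
fn movss_regmove(src1: [u32; 4], src2: [u32; 4]) -> [u32; 4] {
    [src2[0], src1[1], src1[2], src1[3]]
}
```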

cranelift/codegen/src/isa/x64/lower.isle

@@ -1333,8 +1333,38 @@
       (x64_pinsrq vec val idx))

 ;; f32x4.replace_lane
-(rule (vec_insert_lane $F32X4 vec val idx)
-      (x64_insertps vec val (sse_insertps_lane_imm idx)))
+(rule 1 (vec_insert_lane $F32X4 vec val idx)
+      (if-let $true (use_sse41))
+      (x64_insertps vec val (sse_insertps_lane_imm idx)))
+
+;; f32x4.replace_lane 0 - without insertps
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 0)
+      (x64_movss_regmove vec val))
+
+;; f32x4.replace_lane 1 - without insertps
+;; tmp    = [ vec[1] vec[0] val[1] val[0] ]
+;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 1)
+      (let ((tmp Xmm (x64_movlhps val vec)))
+        (x64_shufps tmp vec 0xe2))) ;; 0xe2 == 0b11_10_00_10
+
+;; f32x4.replace_lane 2 - without insertps
+;; tmp    = [ vec[0] vec[3] val[0] val[0] ]
+;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 2)
+      (let ((tmp Xmm (x64_shufps val vec 0x30))) ;; 0x30 == 0b00_11_00_00
+        (x64_shufps vec tmp 0x84))) ;; 0x84 == 0b10_00_01_00
+
+;; f32x4.replace_lane 3 - without insertps
+;; tmp    = [ vec[3] vec[2] val[1] val[0] ]
+;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 3)
+      (let ((tmp Xmm (x64_shufps val vec 0xe4))) ;; 0xe4 == 0b11_10_01_00
+        (x64_shufps vec tmp 0x24))) ;; 0x24 == 0b00_10_01_00
+
+;; Recursively delegate to the above rules by loading from memory first.
+(rule (vec_insert_lane $F32X4 vec (RegMem.Mem addr) idx)
+      (vec_insert_lane $F32X4 vec (x64_movss_load addr) idx))

 ;; External rust code used to calculate the immediate value to `insertps`.
 (decl sse_insertps_lane_imm (u8) u8)
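
The shuffle immediates above are easiest to audit against a model of
`shufps`: the two low lanes of the result select from the first operand, the
two high lanes from the second, with two immediate bits per output lane. A
sketch in Rust (illustrative only, not the emitted code):

```rust
// Model of `shufps a, b, imm`.
fn shufps(a: [f32; 4], b: [f32; 4], imm: u8) -> [f32; 4] {
    [
        a[(imm & 3) as usize],        // output lane 0 selects from `a`
        a[((imm >> 2) & 3) as usize], // output lane 1 selects from `a`
        b[((imm >> 4) & 3) as usize], // output lane 2 selects from `b`
        b[((imm >> 6) & 3) as usize], // output lane 3 selects from `b`
    ]
}

// Model of `movlhps a, b`: the high half of the result is the low half of `b`.
fn movlhps(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0], a[1], b[0], b[1]]
}

// `f32x4.replace_lane 1` without `insertps`, as in the rule above.
// e.g. replace_lane1([1., 2., 3., 4.], [9., 0., 0., 0.]) == [1., 9., 3., 4.]
fn replace_lane1(vec: [f32; 4], val: [f32; 4]) -> [f32; 4] {
    let tmp = movlhps(val, vec); // [val[0], val[1], vec[0], vec[1]]
    shufps(tmp, vec, 0xe2)       // 0b11_10_00_10: [vec[0], val[0], vec[2], vec[3]]
}
```

The lane-2 and lane-3 rules follow the same pattern with different
immediates, while the SSE4.1 path is a single `insertps` whose immediate
(computed by `sse_insertps_lane_imm`) selects the source and destination
lanes.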
@@ -3354,101 +3384,78 @@
       (x64_andnpd sign_bit a)
       (x64_andpd sign_bit b))))

+;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;;
+
+;; Emits either a `round{ss,sd,ps,pd}` instruction, as appropriate, or generates
+;; the appropriate libcall and sequence to call that.
+(decl x64_round (Type Value RoundImm) Xmm)
+(rule 1 (x64_round $F32 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundss a imm))
+(rule 1 (x64_round $F64 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundsd a imm))
+(rule 1 (x64_round $F32X4 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundps a imm))
+(rule 1 (x64_round $F64X2 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundpd a imm))
+
+(rule (x64_round $F32 a imm) (libcall_1 (round_libcall $F32 imm) a))
+(rule (x64_round $F64 a imm) (libcall_1 (round_libcall $F64 imm) a))
+(rule (x64_round $F32X4 a imm)
+      (let ((a Xmm a)
+            (libcall LibCall (round_libcall $F32 imm))
+            (result Xmm (libcall_1 libcall a))
+            (a1 Xmm (libcall_1 libcall (x64_pshufd a 1)))
+            (result Xmm (vec_insert_lane $F32X4 result a1 1))
+            (a2 Xmm (libcall_1 libcall (x64_pshufd a 2)))
+            (result Xmm (vec_insert_lane $F32X4 result a2 2))
+            (a3 Xmm (libcall_1 libcall (x64_pshufd a 3)))
+            (result Xmm (vec_insert_lane $F32X4 result a3 3)))
+        result))
+(rule (x64_round $F64X2 a imm)
+      (let ((a Xmm a)
+            (libcall LibCall (round_libcall $F64 imm))
+            (result Xmm (libcall_1 libcall a))
+            (a1 Xmm (libcall_1 libcall (x64_pshufd a 0x0e))) ;; 0x0e == 0b00_00_11_10
+            (result Xmm (vec_insert_lane $F64X2 result a1 1)))
+        result))
+
+(decl round_libcall (Type RoundImm) LibCall)
+(rule (round_libcall $F32 (RoundImm.RoundUp)) (LibCall.CeilF32))
+(rule (round_libcall $F64 (RoundImm.RoundUp)) (LibCall.CeilF64))
+(rule (round_libcall $F32 (RoundImm.RoundDown)) (LibCall.FloorF32))
+(rule (round_libcall $F64 (RoundImm.RoundDown)) (LibCall.FloorF64))
+(rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32))
+(rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64))
+(rule (round_libcall $F32 (RoundImm.RoundZero)) (LibCall.TruncF32))
+(rule (round_libcall $F64 (RoundImm.RoundZero)) (LibCall.TruncF64))
+
 ;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(rule 1 (lower (ceil a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundUp)))
-(rule 1 (lower (ceil a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundUp)))
-(rule 1 (lower (ceil a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundUp)))
-(rule 1 (lower (ceil a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundUp)))
-(rule (lower (ceil a @ (value_type $F32)))
-      (libcall_1 (LibCall.CeilF32) a))
-(rule (lower (ceil a @ (value_type $F64)))
-      (libcall_1 (LibCall.CeilF64) a))
+(rule (lower (ceil a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundUp)))

 ;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(rule 1 (lower (floor a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundDown)))
-(rule 1 (lower (floor a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundDown)))
-(rule 1 (lower (floor a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundDown)))
-(rule 1 (lower (floor a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundDown)))
-(rule (lower (floor a @ (value_type $F32)))
-      (libcall_1 (LibCall.FloorF32) a))
-(rule (lower (floor a @ (value_type $F64)))
-      (libcall_1 (LibCall.FloorF64) a))
+(rule (lower (floor a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundDown)))

 ;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(rule 1 (lower (nearest a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundNearest)))
-(rule 1 (lower (nearest a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundNearest)))
-(rule 1 (lower (nearest a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundNearest)))
-(rule 1 (lower (nearest a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundNearest)))
-(rule (lower (nearest a @ (value_type $F32)))
-      (libcall_1 (LibCall.NearestF32) a))
-(rule (lower (nearest a @ (value_type $F64)))
-      (libcall_1 (LibCall.NearestF64) a))
+(rule (lower (nearest a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundNearest)))

 ;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(rule 1 (lower (trunc a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundZero)))
-(rule 1 (lower (trunc a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundZero)))
-(rule 1 (lower (trunc a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundZero)))
-(rule 1 (lower (trunc a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundZero)))
-(rule (lower (trunc a @ (value_type $F32)))
-      (libcall_1 (LibCall.TruncF32) a))
-(rule (lower (trunc a @ (value_type $F64)))
-      (libcall_1 (LibCall.TruncF64) a))
+(rule (lower (trunc a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundZero)))

 ;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
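
Two things are worth noting about the rules above. First, the `rule 1`
variants guarded by `use_sse41` take priority over the unnumbered base
rules, so SSE4.1-capable hosts still get a single `round{ss,sd,ps,pd}` and
only older hosts pay for the libcall path. Second, the vector fallback
rotates each lane into position 0 with `pshufd`, rounds it with the scalar
libcall, and inserts the result back; behaviorally it matches this Rust
sketch (illustrative only, `round` standing in for whichever libcall
`round_libcall` picks):

```rust
// Behavioral model of the `$F32X4` arm of `x64_round` without SSE4.1.
fn round_f32x4_via_libcalls(a: [f32; 4], round: fn(f32) -> f32) -> [f32; 4] {
    // The first `libcall_1` passes `a` unshuffled, rounding lane 0; the
    // returned upper lanes are unspecified but are all overwritten below.
    let mut result = [round(a[0]), 0.0, 0.0, 0.0];
    for lane in 1..4 {
        // `(x64_pshufd a imm)` moves lane `lane` into position 0 for the
        // call; `vec_insert_lane` then puts the rounded value back in place.
        result[lane] = round(a[lane]);
    }
    result
}
```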

cranelift/filetests/filetests/runtests/ceil.clif

@@ -3,10 +3,11 @@ test run
 target x86_64
 target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64

 function %ceil_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %ceil_is_nan_f64(-sNaN:0x1) == 1
 ; run: %ceil_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %ceil_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %ceil_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = ceil v0
+    return v1
+}
+; run: %ceil_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x1.0 0x1.0 0x1.0p1 0x1.8p1]
+; run: %ceil_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1]
+
+function %ceil_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = ceil v0
+    return v1
+}
+; run: %ceil_f64x2([0x0.5 0x1.0]) == [0x1.0 0x1.0]
+; run: %ceil_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]

cranelift/filetests/filetests/runtests/floor.clif

@@ -3,10 +3,11 @@ test run
 target x86_64
 target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64

 function %floor_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %floor_is_nan_f64(-sNaN:0x1) == 1
 ; run: %floor_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %floor_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %floor_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = floor v0
+    return v1
+}
+; run: %floor_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1]
+; run: %floor_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x1.0 -0x1.0 -0x1.0p1 -0x1.8p1]
+
+function %floor_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = floor v0
+    return v1
+}
+; run: %floor_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %floor_f64x2([-0x0.5 -0x1.0]) == [-0x1.0 -0x1.0]

cranelift/filetests/filetests/runtests/nearest.clif

@@ -3,10 +3,11 @@ test run
 target x86_64
 target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64

 function %nearest_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %near_is_nan_f64(-sNaN:0x1) == 1
 ; run: %near_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %near_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %nearest_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = nearest v0
+    return v1
+}
+; run: %nearest_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.8p1]
+; run: %nearest_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.8p1]
+
+function %nearest_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = nearest v0
+    return v1
+}
+; run: %nearest_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %nearest_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]

cranelift/filetests/filetests/runtests/trunc.clif

@@ -2,9 +2,12 @@ test interpret
 test run
 target x86_64
+target x86_64 has_sse41=false
 set enable_simd
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64

 function %trunc_f32(f32) -> f32 {
 block0(v0: f32):
@@ -147,3 +150,19 @@ block0(v0: f64):
 ; run: %trunc_is_nan_f64(-sNaN:0x1) == 1
 ; run: %trunc_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %trunc_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %trunc_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = trunc v0
+    return v1
+}
+; run: %trunc_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1]
+; run: %trunc_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1]
+
+function %trunc_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = trunc v0
+    return v1
+}
+; run: %trunc_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %trunc_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]