x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest (#6224)
* x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest

  This commit adds lowerings that work with SSE2 for the CLIF `ceil`, `trunc`,
  `floor`, and `nearest` instructions over vectors. To get these working,
  `insertlane` for float vectors was also implemented for non-SSE4.1 targets.
  Note that the goal of these lowerings is not speed but rather "it works",
  so the decompose-into-libcalls logic for vectors is probably horrendously
  slow, but it should at least be correct.

* Skip new tests on riscv64

* Update cranelift/codegen/src/isa/x64/inst.isle

  Co-authored-by: Andrew Brown <andrew.brown@intel.com>

---------

Co-authored-by: Andrew Brown <andrew.brown@intel.com>
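Annotation for readers skimming the diff: on targets without SSE4.1's `round{ss,sd,ps,pd}`, the vector lowerings below decompose into one scalar libcall per lane. A minimal Rust sketch of the semantics the `f32x4` fallback has to reproduce (illustrative only; `f32::ceil` stands in for `LibCall.CeilF32`):

    /// What the SSE2-only `ceil` lowering for f32x4 must compute: round each
    /// lane independently and reassemble the vector. The generated code does
    /// this straight-line, with `pshufd` to move a lane into position, a
    /// libcall, and a lane insert, rather than with a loop.
    fn ceil_f32x4(a: [f32; 4]) -> [f32; 4] {
        [a[0].ceil(), a[1].ceil(), a[2].ceil(), a[3].ceil()]
    }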
@@ -3119,9 +3119,9 @@
       (if-let $true (use_avx_simd))
       (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm))
 
-;; Helper for creating a `movsd` instruction which creates a new vector
-;; register where the upper 64-bits are from the first operand and the low
-;; 64-bits are from the second operand.
+;; Helper for creating `movsd`/`movss` instructions which create a new vector
+;; register where the upper bits are from the first operand and the low
+;; bits are from the second operand.
 ;;
 ;; Note that the second argument here is specifically `Xmm` instead of `XmmMem`
 ;; because there is no encoding of a 3-operand form of `movsd` and otherwise
@@ -3134,6 +3134,13 @@
       (if-let $true (use_avx_simd))
       (xmm_rmir_vex (AvxOpcode.Vmovsd) src1 src2))
 
+(decl x64_movss_regmove (Xmm Xmm) Xmm)
+(rule (x64_movss_regmove src1 src2)
+      (xmm_rm_r_unaligned (SseOpcode.Movss) src1 src2))
+(rule 1 (x64_movss_regmove src1 src2)
+      (if-let $true (use_avx_simd))
+      (xmm_rmir_vex (AvxOpcode.Vmovss) src1 src2))
+
 ;; Helper for creating `movlhps` instructions.
 (decl x64_movlhps (Xmm XmmMem) Xmm)
 (rule 0 (x64_movlhps src1 src2)
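As a cross-check on the comment above: a small Rust model of the register-to-register `movss` that `x64_movss_regmove` emits (a sketch of the instruction's semantics, not Cranelift code). The low lane comes from the second operand and the upper lanes are preserved from the first.

    // Model of reg-to-reg `movss dst, src`: lane 0 (the low 32 bits) comes
    // from `src`; lanes 1..=3 are kept from `dst`.
    fn movss_regmove(dst: [f32; 4], src: [f32; 4]) -> [f32; 4] {
        [src[0], dst[1], dst[2], dst[3]]
    }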
@@ -1333,8 +1333,38 @@
       (x64_pinsrq vec val idx))
 
 ;; f32x4.replace_lane
-(rule (vec_insert_lane $F32X4 vec val idx)
-      (x64_insertps vec val (sse_insertps_lane_imm idx)))
+(rule 1 (vec_insert_lane $F32X4 vec val idx)
+      (if-let $true (use_sse41))
+      (x64_insertps vec val (sse_insertps_lane_imm idx)))
+
+;; f32x4.replace_lane 0 - without insertps
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 0)
+      (x64_movss_regmove vec val))
+
+;; f32x4.replace_lane 1 - without insertps
+;; tmp = [ vec[1] vec[0] val[1] val[0] ]
+;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 1)
+      (let ((tmp Xmm (x64_movlhps val vec)))
+        (x64_shufps tmp vec 0xe2))) ;; 0xe2 == 0b11_10_00_10
+
+;; f32x4.replace_lane 2 - without insertps
+;; tmp = [ vec[0] vec[3] val[0] val[0] ]
+;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 2)
+      (let ((tmp Xmm (x64_shufps val vec 0x30))) ;; 0x30 == 0b00_11_00_00
+        (x64_shufps vec tmp 0x84))) ;; 0x84 == 0b10_00_01_00
+
+;; f32x4.replace_lane 3 - without insertps
+;; tmp = [ vec[3] vec[2] val[1] val[0] ]
+;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 3)
+      (let ((tmp Xmm (x64_shufps val vec 0xe4))) ;; 0xe4 == 0b11_10_01_00
+        (x64_shufps vec tmp 0x24))) ;; 0x24 == 0b00_10_01_00
+
+;; Recursively delegate to the above rules by loading from memory first.
+(rule (vec_insert_lane $F32X4 vec (RegMem.Mem addr) idx)
+      (vec_insert_lane $F32X4 vec (x64_movss_load addr) idx))
 
 ;; External rust code used to calculate the immediate value to `insertps`.
 (decl sse_insertps_lane_imm (u8) u8)
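Two notes on the hunk above. First, the Rust helper behind `sse_insertps_lane_imm` is not part of this diff; per the SSE4.1 encoding of `insertps`, the immediate carries the destination lane in bits 5:4, a source-lane selector in bits 7:6, and a zero mask in bits 3:0. Second, the shuffle immediates in the non-insertps rules are easier to audit against a software model of `shufps` and `movlhps` (hypothetical helpers, for illustration): `shufps a b imm` selects the two low result lanes from `a` via immediate bits 1:0 and 3:2, and the two high lanes from `b` via bits 5:4 and 7:6.

    /// Software model of `shufps a, b, imm`; element 0 is the lowest lane.
    fn shufps(a: [f32; 4], b: [f32; 4], imm: u8) -> [f32; 4] {
        [
            a[(imm & 0b11) as usize],
            a[((imm >> 2) & 0b11) as usize],
            b[((imm >> 4) & 0b11) as usize],
            b[((imm >> 6) & 0b11) as usize],
        ]
    }

    /// Software model of `movlhps a, b`: lanes 0-1 from `a`, lanes 2-3 from
    /// the low two lanes of `b`.
    fn movlhps(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
        [a[0], a[1], b[0], b[1]]
    }

    fn main() {
        // Check the `f32x4.replace_lane 1` sequence: the result should be
        // `vec` with lane 1 replaced by lane 0 of `val`.
        let vec = [10.0, 11.0, 12.0, 13.0];
        let val = [1.0, 2.0, 3.0, 4.0];
        let tmp = movlhps(val, vec); // [val[0], val[1], vec[0], vec[1]]
        assert_eq!(shufps(tmp, vec, 0xe2), [10.0, 1.0, 12.0, 13.0]);
    }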
@@ -3354,101 +3384,78 @@
       (x64_andnpd sign_bit a)
       (x64_andpd sign_bit b))))
 
+;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;;
+
+;; Emits either a `round{ss,sd,ps,pd}` instruction, as appropriate, or generates
+;; the appropriate libcall and sequence to call that.
+(decl x64_round (Type Value RoundImm) Xmm)
+(rule 1 (x64_round $F32 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundss a imm))
+(rule 1 (x64_round $F64 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundsd a imm))
+(rule 1 (x64_round $F32X4 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundps a imm))
+(rule 1 (x64_round $F64X2 a imm)
+      (if-let $true (use_sse41))
+      (x64_roundpd a imm))
+
+(rule (x64_round $F32 a imm) (libcall_1 (round_libcall $F32 imm) a))
+(rule (x64_round $F64 a imm) (libcall_1 (round_libcall $F64 imm) a))
+(rule (x64_round $F32X4 a imm)
+      (let (
+          (a Xmm a)
+          (libcall LibCall (round_libcall $F32 imm))
+          (result Xmm (libcall_1 libcall a))
+          (a1 Xmm (libcall_1 libcall (x64_pshufd a 1)))
+          (result Xmm (vec_insert_lane $F32X4 result a1 1))
+          (a2 Xmm (libcall_1 libcall (x64_pshufd a 2)))
+          (result Xmm (vec_insert_lane $F32X4 result a2 2))
+          (a3 Xmm (libcall_1 libcall (x64_pshufd a 3)))
+          (result Xmm (vec_insert_lane $F32X4 result a3 3))
+        )
+        result))
+(rule (x64_round $F64X2 a imm)
+      (let (
+          (a Xmm a)
+          (libcall LibCall (round_libcall $F64 imm))
+          (result Xmm (libcall_1 libcall a))
+          (a1 Xmm (libcall_1 libcall (x64_pshufd a 0x0e))) ;; 0x0e == 0b00_00_11_10
+          (result Xmm (vec_insert_lane $F64X2 result a1 1))
+        )
+        result))
+
+(decl round_libcall (Type RoundImm) LibCall)
+(rule (round_libcall $F32 (RoundImm.RoundUp)) (LibCall.CeilF32))
+(rule (round_libcall $F64 (RoundImm.RoundUp)) (LibCall.CeilF64))
+(rule (round_libcall $F32 (RoundImm.RoundDown)) (LibCall.FloorF32))
+(rule (round_libcall $F64 (RoundImm.RoundDown)) (LibCall.FloorF64))
+(rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32))
+(rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64))
+(rule (round_libcall $F32 (RoundImm.RoundZero)) (LibCall.TruncF32))
+(rule (round_libcall $F64 (RoundImm.RoundZero)) (LibCall.TruncF64))
+
 ;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (ceil a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundUp)))
-
-(rule 1 (lower (ceil a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundUp)))
-
-(rule 1 (lower (ceil a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundUp)))
-
-(rule 1 (lower (ceil a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundUp)))
-
-(rule (lower (ceil a @ (value_type $F32)))
-      (libcall_1 (LibCall.CeilF32) a))
-
-(rule (lower (ceil a @ (value_type $F64)))
-      (libcall_1 (LibCall.CeilF64) a))
+(rule (lower (ceil a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundUp)))
 
 ;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (floor a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundDown)))
-
-(rule 1 (lower (floor a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundDown)))
-
-(rule 1 (lower (floor a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundDown)))
-
-(rule 1 (lower (floor a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundDown)))
-
-(rule (lower (floor a @ (value_type $F32)))
-      (libcall_1 (LibCall.FloorF32) a))
-
-(rule (lower (floor a @ (value_type $F64)))
-      (libcall_1 (LibCall.FloorF64) a))
+(rule (lower (floor a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundDown)))
 
 ;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (nearest a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundNearest)))
-
-(rule 1 (lower (nearest a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundNearest)))
-
-(rule 1 (lower (nearest a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundNearest)))
-
-(rule 1 (lower (nearest a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundNearest)))
-
-(rule (lower (nearest a @ (value_type $F32)))
-      (libcall_1 (LibCall.NearestF32) a))
-
-(rule (lower (nearest a @ (value_type $F64)))
-      (libcall_1 (LibCall.NearestF64) a))
+(rule (lower (nearest a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundNearest)))
 
 ;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (trunc a @ (value_type $F32)))
-      (if-let $true (use_sse41))
-      (x64_roundss a (RoundImm.RoundZero)))
-
-(rule 1 (lower (trunc a @ (value_type $F64)))
-      (if-let $true (use_sse41))
-      (x64_roundsd a (RoundImm.RoundZero)))
-
-(rule 1 (lower (trunc a @ (value_type $F32X4)))
-      (if-let $true (use_sse41))
-      (x64_roundps a (RoundImm.RoundZero)))
-
-(rule 1 (lower (trunc a @ (value_type $F64X2)))
-      (if-let $true (use_sse41))
-      (x64_roundpd a (RoundImm.RoundZero)))
-
-(rule (lower (trunc a @ (value_type $F32)))
-      (libcall_1 (LibCall.TruncF32) a))
-
-(rule (lower (trunc a @ (value_type $F64)))
-      (libcall_1 (LibCall.TruncF64) a))
+(rule (lower (trunc a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundZero)))
 
 ;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
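One semantic note on the libcall decomposition in the hunk above: the `x64_pshufd` immediates choose which source lane lands in lane 0, which is where the scalar libcall reads its argument. A Rust model of `pshufd` (again a sketch of the instruction, not Cranelift code) makes the constants checkable:

    /// Software model of `pshufd a, imm`: 32-bit result lane `i` is selected
    /// from `a` by immediate bits 2*i+1 and 2*i.
    fn pshufd(a: [u32; 4], imm: u8) -> [u32; 4] {
        std::array::from_fn(|i| a[((imm >> (2 * i)) & 0b11) as usize])
    }

    fn main() {
        let a = [0, 1, 2, 3];
        // For f32x4, immediate `k` moves f32 lane `k` into lane 0.
        assert_eq!(pshufd(a, 1)[0], 1);
        assert_eq!(pshufd(a, 2)[0], 2);
        assert_eq!(pshufd(a, 3)[0], 3);
        // For f64x2, 0x0e == 0b00_00_11_10 moves dwords 2-3 (f64 lane 1)
        // into dwords 0-1.
        assert_eq!(pshufd(a, 0x0e), [2, 3, 0, 0]);
    }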
@@ -3,10 +3,11 @@ test run
 target x86_64
+target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %ceil_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %ceil_is_nan_f64(-sNaN:0x1) == 1
 ; run: %ceil_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %ceil_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %ceil_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = ceil v0
+    return v1
+}
+; run: %ceil_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x1.0 0x1.0 0x1.0p1 0x1.8p1]
+; run: %ceil_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1]
+
+function %ceil_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = ceil v0
+    return v1
+}
+; run: %ceil_f64x2([0x0.5 0x1.0]) == [0x1.0 0x1.0]
+; run: %ceil_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]
@@ -3,10 +3,11 @@ test run
 target x86_64
+target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %floor_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %floor_is_nan_f64(-sNaN:0x1) == 1
 ; run: %floor_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %floor_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %floor_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = floor v0
+    return v1
+}
+; run: %floor_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1]
+; run: %floor_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x1.0 -0x1.0 -0x1.0p1 -0x1.8p1]
+
+function %floor_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = floor v0
+    return v1
+}
+; run: %floor_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %floor_f64x2([-0x0.5 -0x1.0]) == [-0x1.0 -0x1.0]
@@ -3,10 +3,11 @@ test run
 target x86_64
+target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
 target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %nearest_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %near_is_nan_f64(-sNaN:0x1) == 1
 ; run: %near_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %near_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %nearest_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = nearest v0
+    return v1
+}
+; run: %nearest_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.8p1]
+; run: %nearest_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.8p1]
+
+function %nearest_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = nearest v0
+    return v1
+}
+; run: %nearest_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %nearest_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]
@@ -2,9 +2,12 @@ test interpret
 test run
 target x86_64
+target x86_64 has_sse41=false
 set enable_simd
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %trunc_f32(f32) -> f32 {
 block0(v0: f32):
@@ -147,3 +150,19 @@ block0(v0: f64):
 ; run: %trunc_is_nan_f64(-sNaN:0x1) == 1
 ; run: %trunc_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %trunc_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %trunc_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = trunc v0
+    return v1
+}
+; run: %trunc_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1]
+; run: %trunc_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1]
+
+function %trunc_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = trunc v0
+    return v1
+}
+; run: %trunc_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %trunc_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]