From 62f8928bee1df2df8163f5ca7478d7e8d0f37e6b Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Tue, 18 Apr 2023 12:23:18 -0500 Subject: [PATCH] x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest (#6224) * x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest This commit adds lowerings that work with SSE2 for CLIF `ceil`, `trunc`, `floor`, and `nearest` instructions over vectors. To get these working, `insertlane` for float vectors was also implemented with non-SSE4.1 instructions. Note that the goal of these lowerings is not speed but rather "it works", so the decompose-to-call-libcalls logic for vectors is probably horrendously slow but should at least be correct. * Skip new tests on riscv64 * Update cranelift/codegen/src/isa/x64/inst.isle Co-authored-by: Andrew Brown --------- Co-authored-by: Andrew Brown --- cranelift/codegen/src/isa/x64/inst.isle | 13 +- cranelift/codegen/src/isa/x64/lower.isle | 179 +++++++++--------- .../filetests/filetests/runtests/ceil.clif | 21 +- .../filetests/filetests/runtests/floor.clif | 21 +- .../filetests/filetests/runtests/nearest.clif | 21 +- .../filetests/filetests/runtests/trunc.clif | 21 +- 6 files changed, 180 insertions(+), 96 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 0bd0d859ff..08d5c45846 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -3119,9 +3119,9 @@ (if-let $true (use_avx_simd)) (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm)) -;; Helper for creating a `movsd` instruction which creates a new vector -;; register where the upper 64-bits are from the first operand and the low -;; 64-bits are from the second operand. +;; Helper for creating `movsd`/`movss` instructions which create a new vector +;; register where the upper bits are from the first operand and the low +;; bits are from the second operand. 
;; ;; Note that the second argument here is specifically `Xmm` instead of `XmmMem` ;; because there is no encoding of a 3-operand form of `movsd` and otherwise @@ -3134,6 +3134,13 @@ (if-let $true (use_avx_simd)) (xmm_rmir_vex (AvxOpcode.Vmovsd) src1 src2)) +(decl x64_movss_regmove (Xmm Xmm) Xmm) +(rule (x64_movss_regmove src1 src2) + (xmm_rm_r_unaligned (SseOpcode.Movss) src1 src2)) +(rule 1 (x64_movss_regmove src1 src2) + (if-let $true (use_avx_simd)) + (xmm_rmir_vex (AvxOpcode.Vmovss) src1 src2)) + ;; Helper for creating `movlhps` instructions. (decl x64_movlhps (Xmm XmmMem) Xmm) (rule 0 (x64_movlhps src1 src2) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 255f73bb82..0018d30946 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1333,8 +1333,38 @@ (x64_pinsrq vec val idx)) ;; f32x4.replace_lane -(rule (vec_insert_lane $F32X4 vec val idx) - (x64_insertps vec val (sse_insertps_lane_imm idx))) +(rule 1 (vec_insert_lane $F32X4 vec val idx) + (if-let $true (use_sse41)) + (x64_insertps vec val (sse_insertps_lane_imm idx))) + +;; f32x4.replace_lane 0 - without insertps +(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 0) + (x64_movss_regmove vec val)) + +;; f32x4.replace_lane 1 - without insertps +;; tmp = [ vec[1] vec[0] val[1] val[0] ] +;; result = [ vec[3] vec[2] tmp[0] tmp[2] ] +(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 1) + (let ((tmp Xmm (x64_movlhps val vec))) + (x64_shufps tmp vec 0xe2))) ;; 0xe2 == 0b11_10_00_10 + +;; f32x4.replace_lane 2 - without insertps +;; tmp = [ vec[0] vec[3] val[0] val[0] ] +;; result = [ tmp[2] tmp[0] vec[1] vec[0] ] +(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 2) + (let ((tmp Xmm (x64_shufps val vec 0x30))) ;; 0x30 == 0b00_11_00_00 + (x64_shufps vec tmp 0x84))) ;; 0x84 == 0b10_00_01_00 + +;; f32x4.replace_lane 3 - without insertps +;; tmp = [ vec[3] vec[2] val[1] val[0] ] +;; result = [ tmp[0] tmp[2] vec[1] 
vec[0] ] +(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 3) + (let ((tmp Xmm (x64_shufps val vec 0xe4))) ;; 0xe4 == 0b11_10_01_00 + (x64_shufps vec tmp 0x24))) ;; 0x24 == 0b00_10_01_00 + +;; Recursively delegate to the above rules by loading from memory first. +(rule (vec_insert_lane $F32X4 vec (RegMem.Mem addr) idx) + (vec_insert_lane $F32X4 vec (x64_movss_load addr) idx)) ;; External rust code used to calculate the immediate value to `insertps`. (decl sse_insertps_lane_imm (u8) u8) @@ -3354,101 +3384,78 @@ (x64_andnpd sign_bit a) (x64_andpd sign_bit b)))) +;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;; + +;; Emits either a `round{ss,sd,ps,pd}` instruction, as appropriate, or generates +;; the appropriate libcall and sequence to call that. +(decl x64_round (Type Value RoundImm) Xmm) +(rule 1 (x64_round $F32 a imm) + (if-let $true (use_sse41)) + (x64_roundss a imm)) +(rule 1 (x64_round $F64 a imm) + (if-let $true (use_sse41)) + (x64_roundsd a imm)) +(rule 1 (x64_round $F32X4 a imm) + (if-let $true (use_sse41)) + (x64_roundps a imm)) +(rule 1 (x64_round $F64X2 a imm) + (if-let $true (use_sse41)) + (x64_roundpd a imm)) + +(rule (x64_round $F32 a imm) (libcall_1 (round_libcall $F32 imm) a)) +(rule (x64_round $F64 a imm) (libcall_1 (round_libcall $F64 imm) a)) +(rule (x64_round $F32X4 a imm) + (let ( + (a Xmm a) + (libcall LibCall (round_libcall $F32 imm)) + (result Xmm (libcall_1 libcall a)) + (a1 Xmm (libcall_1 libcall (x64_pshufd a 1))) + (result Xmm (vec_insert_lane $F32X4 result a1 1)) + (a2 Xmm (libcall_1 libcall (x64_pshufd a 2))) + (result Xmm (vec_insert_lane $F32X4 result a2 2)) + (a3 Xmm (libcall_1 libcall (x64_pshufd a 3))) + (result Xmm (vec_insert_lane $F32X4 result a3 3)) + ) + result)) +(rule (x64_round $F64X2 a imm) + (let ( + (a Xmm a) + (libcall LibCall (round_libcall $F64 imm)) + (result Xmm (libcall_1 libcall a)) + (a1 Xmm (libcall_1 libcall (x64_pshufd a 0x0e))) ;; 0x0e == 0b00_00_11_10 + (result Xmm 
(vec_insert_lane $F64X2 result a1 1)) + ) + result)) + +(decl round_libcall (Type RoundImm) LibCall) +(rule (round_libcall $F32 (RoundImm.RoundUp)) (LibCall.CeilF32)) +(rule (round_libcall $F64 (RoundImm.RoundUp)) (LibCall.CeilF64)) +(rule (round_libcall $F32 (RoundImm.RoundDown)) (LibCall.FloorF32)) +(rule (round_libcall $F64 (RoundImm.RoundDown)) (LibCall.FloorF64)) +(rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32)) +(rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64)) +(rule (round_libcall $F32 (RoundImm.RoundZero)) (LibCall.TruncF32)) +(rule (round_libcall $F64 (RoundImm.RoundZero)) (LibCall.TruncF64)) + ;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower (ceil a @ (value_type $F32))) - (if-let $true (use_sse41)) - (x64_roundss a (RoundImm.RoundUp))) - -(rule 1 (lower (ceil a @ (value_type $F64))) - (if-let $true (use_sse41)) - (x64_roundsd a (RoundImm.RoundUp))) - -(rule 1 (lower (ceil a @ (value_type $F32X4))) - (if-let $true (use_sse41)) - (x64_roundps a (RoundImm.RoundUp))) - -(rule 1 (lower (ceil a @ (value_type $F64X2))) - (if-let $true (use_sse41)) - (x64_roundpd a (RoundImm.RoundUp))) - -(rule (lower (ceil a @ (value_type $F32))) - (libcall_1 (LibCall.CeilF32) a)) - -(rule (lower (ceil a @ (value_type $F64))) - (libcall_1 (LibCall.CeilF64) a)) +(rule (lower (ceil a @ (value_type ty))) + (x64_round ty a (RoundImm.RoundUp))) ;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower (floor a @ (value_type $F32))) - (if-let $true (use_sse41)) - (x64_roundss a (RoundImm.RoundDown))) - -(rule 1 (lower (floor a @ (value_type $F64))) - (if-let $true (use_sse41)) - (x64_roundsd a (RoundImm.RoundDown))) - -(rule 1 (lower (floor a @ (value_type $F32X4))) - (if-let $true (use_sse41)) - (x64_roundps a (RoundImm.RoundDown))) - -(rule 1 (lower (floor a @ (value_type $F64X2))) - (if-let $true (use_sse41)) - (x64_roundpd a (RoundImm.RoundDown))) 
- -(rule (lower (floor a @ (value_type $F32))) - (libcall_1 (LibCall.FloorF32) a)) - -(rule (lower (floor a @ (value_type $F64))) - (libcall_1 (LibCall.FloorF64) a)) +(rule (lower (floor a @ (value_type ty))) + (x64_round ty a (RoundImm.RoundDown))) ;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower (nearest a @ (value_type $F32))) - (if-let $true (use_sse41)) - (x64_roundss a (RoundImm.RoundNearest))) - -(rule 1 (lower (nearest a @ (value_type $F64))) - (if-let $true (use_sse41)) - (x64_roundsd a (RoundImm.RoundNearest))) - -(rule 1 (lower (nearest a @ (value_type $F32X4))) - (if-let $true (use_sse41)) - (x64_roundps a (RoundImm.RoundNearest))) - -(rule 1 (lower (nearest a @ (value_type $F64X2))) - (if-let $true (use_sse41)) - (x64_roundpd a (RoundImm.RoundNearest))) - -(rule (lower (nearest a @ (value_type $F32))) - (libcall_1 (LibCall.NearestF32) a)) - -(rule (lower (nearest a @ (value_type $F64))) - (libcall_1 (LibCall.NearestF64) a)) +(rule (lower (nearest a @ (value_type ty))) + (x64_round ty a (RoundImm.RoundNearest))) ;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower (trunc a @ (value_type $F32))) - (if-let $true (use_sse41)) - (x64_roundss a (RoundImm.RoundZero))) - -(rule 1 (lower (trunc a @ (value_type $F64))) - (if-let $true (use_sse41)) - (x64_roundsd a (RoundImm.RoundZero))) - -(rule 1 (lower (trunc a @ (value_type $F32X4))) - (if-let $true (use_sse41)) - (x64_roundps a (RoundImm.RoundZero))) - -(rule 1 (lower (trunc a @ (value_type $F64X2))) - (if-let $true (use_sse41)) - (x64_roundpd a (RoundImm.RoundZero))) - -(rule (lower (trunc a @ (value_type $F32))) - (libcall_1 (LibCall.TruncF32) a)) - -(rule (lower (trunc a @ (value_type $F64))) - (libcall_1 (LibCall.TruncF64) a)) +(rule (lower (trunc a @ (value_type ty))) + (x64_round ty a (RoundImm.RoundZero))) ;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git 
a/cranelift/filetests/filetests/runtests/ceil.clif b/cranelift/filetests/filetests/runtests/ceil.clif index cda083705c..8596e3ad91 100644 --- a/cranelift/filetests/filetests/runtests/ceil.clif +++ b/cranelift/filetests/filetests/runtests/ceil.clif @@ -3,10 +3,11 @@ test run target x86_64 target x86_64 has_sse41=false set enable_simd -target x86_64 has_avx +target x86_64 sse42 has_avx target aarch64 target s390x -target riscv64 +;; FIXME: needs support for vectors +;;target riscv64 function %ceil_f32(f32) -> f32 { block0(v0: f32): @@ -149,3 +150,19 @@ block0(v0: f64): ; run: %ceil_is_nan_f64(-sNaN:0x1) == 1 ; run: %ceil_is_nan_f64(+sNaN:0x4000000000001) == 1 ; run: %ceil_is_nan_f64(-sNaN:0x4000000000001) == 1 + +function %ceil_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = ceil v0 + return v1 +} +; run: %ceil_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x1.0 0x1.0 0x1.0p1 0x1.8p1] +; run: %ceil_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1] + +function %ceil_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = ceil v0 + return v1 +} +; run: %ceil_f64x2([0x0.5 0x1.0]) == [0x1.0 0x1.0] +; run: %ceil_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/floor.clif b/cranelift/filetests/filetests/runtests/floor.clif index 0c8904fe5d..d862b3abc2 100644 --- a/cranelift/filetests/filetests/runtests/floor.clif +++ b/cranelift/filetests/filetests/runtests/floor.clif @@ -3,10 +3,11 @@ test run target x86_64 target x86_64 has_sse41=false set enable_simd -target x86_64 has_avx +target x86_64 sse42 has_avx target aarch64 target s390x -target riscv64 +;; FIXME: needs support for vectors +;;target riscv64 function %floor_f32(f32) -> f32 { block0(v0: f32): @@ -149,3 +150,19 @@ block0(v0: f64): ; run: %floor_is_nan_f64(-sNaN:0x1) == 1 ; run: %floor_is_nan_f64(+sNaN:0x4000000000001) == 1 ; run: %floor_is_nan_f64(-sNaN:0x4000000000001) == 1 + +function %floor_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = floor v0 + 
return v1 +} +; run: %floor_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1] +; run: %floor_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x1.0 -0x1.0 -0x1.0p1 -0x1.8p1] + +function %floor_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = floor v0 + return v1 +} +; run: %floor_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] +; run: %floor_f64x2([-0x0.5 -0x1.0]) == [-0x1.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/nearest.clif b/cranelift/filetests/filetests/runtests/nearest.clif index 71c2eec0c4..f5ece4a1f2 100644 --- a/cranelift/filetests/filetests/runtests/nearest.clif +++ b/cranelift/filetests/filetests/runtests/nearest.clif @@ -3,10 +3,11 @@ test run target x86_64 target x86_64 has_sse41=false set enable_simd -target x86_64 has_avx +target x86_64 sse42 has_avx target aarch64 target s390x -target riscv64 +;; FIXME: needs support for vectors +;;target riscv64 function %nearest_f32(f32) -> f32 { block0(v0: f32): @@ -149,3 +150,19 @@ block0(v0: f64): ; run: %near_is_nan_f64(-sNaN:0x1) == 1 ; run: %near_is_nan_f64(+sNaN:0x4000000000001) == 1 ; run: %near_is_nan_f64(-sNaN:0x4000000000001) == 1 + +function %nearest_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = nearest v0 + return v1 +} +; run: %nearest_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.8p1] +; run: %nearest_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.8p1] + +function %nearest_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = nearest v0 + return v1 +} +; run: %nearest_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] +; run: %nearest_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0] diff --git a/cranelift/filetests/filetests/runtests/trunc.clif b/cranelift/filetests/filetests/runtests/trunc.clif index f0b427c917..f4a377b42e 100644 --- a/cranelift/filetests/filetests/runtests/trunc.clif +++ b/cranelift/filetests/filetests/runtests/trunc.clif @@ -2,9 +2,12 @@ test interpret test run target x86_64 target x86_64 has_sse41=false +set enable_simd +target x86_64 sse42 
has_avx target aarch64 target s390x -target riscv64 +;; FIXME: needs support for vectors +;;target riscv64 function %trunc_f32(f32) -> f32 { block0(v0: f32): @@ -147,3 +150,19 @@ block0(v0: f64): ; run: %trunc_is_nan_f64(-sNaN:0x1) == 1 ; run: %trunc_is_nan_f64(+sNaN:0x4000000000001) == 1 ; run: %trunc_is_nan_f64(-sNaN:0x4000000000001) == 1 + +function %trunc_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = trunc v0 + return v1 +} +; run: %trunc_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1] +; run: %trunc_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1] + +function %trunc_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = trunc v0 + return v1 +} +; run: %trunc_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0] +; run: %trunc_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]