From 62f8928bee1df2df8163f5ca7478d7e8d0f37e6b Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Tue, 18 Apr 2023 12:23:18 -0500
Subject: [PATCH] x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest
 (#6224)

* x64: Add non-SSE4.1 lowerings of ceil/trunc/floor/nearest

This commit adds lowerings that work with SSE2 for CLIF `ceil`, `trunc`,
`floor`, and `nearest` instructions over vectors. To get these working,
`insertlane` for float vectors was also implemented for targets without
SSE4.1.

Note that the goal of these lowerings is not speed but rather "it
works", so the decompose-to-call-libcalls logic for vectors is probably
horrendously slow but should at least be correct.

* Skip new tests on riscv64

* Update cranelift/codegen/src/isa/x64/inst.isle

Co-authored-by: Andrew Brown <andrew.brown@intel.com>

---------

Co-authored-by: Andrew Brown <andrew.brown@intel.com>
---
 cranelift/codegen/src/isa/x64/inst.isle       |  13 +-
 cranelift/codegen/src/isa/x64/lower.isle      | 179 +++++++++---------
 .../filetests/filetests/runtests/ceil.clif    |  21 +-
 .../filetests/filetests/runtests/floor.clif   |  21 +-
 .../filetests/filetests/runtests/nearest.clif |  21 +-
 .../filetests/filetests/runtests/trunc.clif   |  21 +-
 6 files changed, 180 insertions(+), 96 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 0bd0d859ff..08d5c45846 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -3119,9 +3119,9 @@
       (if-let $true (use_avx_simd))
       (xmm_rmr_imm_vex (AvxOpcode.Vpblendw) src1 src2 imm))
 
-;; Helper for creating a `movsd` instruction which creates a new vector
-;; register where the upper 64-bits are from the first operand and the low
-;; 64-bits are from the second operand.
+;; Helper for creating `movsd`/`movss` instructions which create a new vector
+;; register where the low 64 bits (`movsd`) or low 32 bits (`movss`) are from
+;; the second operand and the remaining upper bits are from the first operand.
 ;;
 ;; Note that the second argument here is specifically `Xmm` instead of `XmmMem`
 ;; because there is no encoding of a 3-operand form of `movsd` and otherwise
@@ -3134,6 +3134,13 @@
         (if-let $true (use_avx_simd))
         (xmm_rmir_vex (AvxOpcode.Vmovsd) src1 src2))
 
+(decl x64_movss_regmove (Xmm Xmm) Xmm) ;; register-to-register form only: both inputs are `Xmm`
+(rule (x64_movss_regmove src1 src2)
+      (xmm_rm_r_unaligned (SseOpcode.Movss) src1 src2)) ;; default: SSE `movss`
+(rule 1 (x64_movss_regmove src1 src2)
+        (if-let $true (use_avx_simd)) ;; higher-priority rule, taken when `use_avx_simd` holds
+        (xmm_rmir_vex (AvxOpcode.Vmovss) src1 src2))
+
 ;; Helper for creating `movlhps` instructions.
 (decl x64_movlhps (Xmm XmmMem) Xmm)
 (rule 0 (x64_movlhps src1 src2)
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 255f73bb82..0018d30946 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1333,8 +1333,38 @@
       (x64_pinsrq vec val idx))
 
 ;; f32x4.replace_lane
-(rule (vec_insert_lane $F32X4 vec val idx)
-      (x64_insertps vec val (sse_insertps_lane_imm idx)))
+(rule 1 (vec_insert_lane $F32X4 vec val idx)
+        (if-let $true (use_sse41)) ;; `insertps` path; the priority-0 rules below cover the non-SSE4.1 case
+        (x64_insertps vec val (sse_insertps_lane_imm idx)))
+
+;; f32x4.replace_lane 0 - without insertps (upper lanes of `vec` are kept)
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 0)
+      (x64_movss_regmove vec val))
+
+;; f32x4.replace_lane 1 - without insertps (lane lists read high-to-low)
+;; tmp    = [ vec[1] vec[0] val[1] val[0] ]
+;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 1)
+      (let ((tmp Xmm (x64_movlhps val vec)))
+        (x64_shufps tmp vec 0xe2))) ;; 0xe2 == 0b11_10_00_10
+
+;; f32x4.replace_lane 2 - without insertps (lane lists read high-to-low)
+;; tmp    = [ vec[0] vec[3] val[0] val[0] ]
+;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 2)
+      (let ((tmp Xmm (x64_shufps val vec 0x30)))  ;; 0x30 == 0b00_11_00_00
+        (x64_shufps vec tmp 0x84)))               ;; 0x84 == 0b10_00_01_00
+
+;; f32x4.replace_lane 3 - without insertps (lane lists read high-to-low)
+;; tmp    = [ vec[3] vec[2] val[1] val[0] ]
+;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
+(rule (vec_insert_lane $F32X4 vec (RegMem.Reg val) 3)
+      (let ((tmp Xmm (x64_shufps val vec 0xe4)))  ;; 0xe4 == 0b11_10_01_00
+        (x64_shufps vec tmp 0x24)))               ;; 0x24 == 0b00_10_01_00
+
+;; Memory operand: load the scalar with `x64_movss_load`, then recurse into the register rules above.
+(rule (vec_insert_lane $F32X4 vec (RegMem.Mem addr) idx)
+      (vec_insert_lane $F32X4 vec (x64_movss_load addr) idx))
 
 ;; External rust code used to calculate the immediate value to `insertps`.
 (decl sse_insertps_lane_imm (u8) u8)
@@ -3354,101 +3384,78 @@
           (x64_andnpd sign_bit a)
           (x64_andpd sign_bit b))))
 
+;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;;
+
+;; Emits a `round{ss,sd,ps,pd}` instruction when SSE4.1 is available; otherwise
+;; falls back to the scalar rounding libcall, called once per lane for vectors.
+(decl x64_round (Type Value RoundImm) Xmm)
+(rule 1 (x64_round $F32 a imm)
+        (if-let $true (use_sse41))
+        (x64_roundss a imm))
+(rule 1 (x64_round $F64 a imm)
+        (if-let $true (use_sse41))
+        (x64_roundsd a imm))
+(rule 1 (x64_round $F32X4 a imm)
+        (if-let $true (use_sse41))
+        (x64_roundps a imm))
+(rule 1 (x64_round $F64X2 a imm)
+        (if-let $true (use_sse41))
+        (x64_roundpd a imm))
+
+(rule (x64_round $F32 a imm) (libcall_1 (round_libcall $F32 imm) a))
+(rule (x64_round $F64 a imm) (libcall_1 (round_libcall $F64 imm) a))
+(rule (x64_round $F32X4 a imm)
+      (let (
+          (a Xmm a) ;; bind the input as an `Xmm` once; it is reused for every lane below
+          (libcall LibCall (round_libcall $F32 imm)) ;; the vector case reuses the scalar $F32 libcall
+          (result Xmm (libcall_1 libcall a)) ;; rounds lane 0; lanes 1-3 of `result` are replaced below
+          (a1 Xmm (libcall_1 libcall (x64_pshufd a 1))) ;; imm 1: lane 1 of `a` moved to lane 0
+          (result Xmm (vec_insert_lane $F32X4 result a1 1))
+          (a2 Xmm (libcall_1 libcall (x64_pshufd a 2))) ;; imm 2: lane 2 of `a` moved to lane 0
+          (result Xmm (vec_insert_lane $F32X4 result a2 2))
+          (a3 Xmm (libcall_1 libcall (x64_pshufd a 3))) ;; imm 3: lane 3 of `a` moved to lane 0
+          (result Xmm (vec_insert_lane $F32X4 result a3 3))
+        )
+        result))
+(rule (x64_round $F64X2 a imm)
+      (let (
+          (a Xmm a) ;; bind the input as an `Xmm` once; it is reused for both lanes
+          (libcall LibCall (round_libcall $F64 imm)) ;; the vector case reuses the scalar $F64 libcall
+          (result Xmm (libcall_1 libcall a)) ;; rounds lane 0; lane 1 of `result` is replaced below
+          (a1 Xmm (libcall_1 libcall (x64_pshufd a 0x0e))) ;; 0x0e == 0b00_00_11_10 (high f64 moved to the low half)
+          (result Xmm (vec_insert_lane $F64X2 result a1 1))
+        )
+        result))
+
+(decl round_libcall (Type RoundImm) LibCall) ;; maps (scalar float type, rounding mode) to the matching libcall
+(rule (round_libcall $F32 (RoundImm.RoundUp)) (LibCall.CeilF32))
+(rule (round_libcall $F64 (RoundImm.RoundUp)) (LibCall.CeilF64))
+(rule (round_libcall $F32 (RoundImm.RoundDown)) (LibCall.FloorF32))
+(rule (round_libcall $F64 (RoundImm.RoundDown)) (LibCall.FloorF64))
+(rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32))
+(rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64))
+(rule (round_libcall $F32 (RoundImm.RoundZero)) (LibCall.TruncF32))
+(rule (round_libcall $F64 (RoundImm.RoundZero)) (LibCall.TruncF64))
+
 ;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (ceil a @ (value_type $F32)))
-        (if-let $true (use_sse41))
-        (x64_roundss a (RoundImm.RoundUp)))
-
-(rule 1 (lower (ceil a @ (value_type $F64)))
-        (if-let $true (use_sse41))
-        (x64_roundsd a (RoundImm.RoundUp)))
-
-(rule 1 (lower (ceil a @ (value_type $F32X4)))
-        (if-let $true (use_sse41))
-        (x64_roundps a (RoundImm.RoundUp)))
-
-(rule 1 (lower (ceil a @ (value_type $F64X2)))
-        (if-let $true (use_sse41))
-        (x64_roundpd a (RoundImm.RoundUp)))
-
-(rule (lower (ceil a @ (value_type $F32)))
-      (libcall_1 (LibCall.CeilF32) a))
-
-(rule (lower (ceil a @ (value_type $F64)))
-      (libcall_1 (LibCall.CeilF64) a))
+(rule (lower (ceil a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundUp))) ;; `x64_round` chooses the SSE4.1 instruction or the libcall per type
 
 ;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (floor a @ (value_type $F32)))
-        (if-let $true (use_sse41))
-        (x64_roundss a (RoundImm.RoundDown)))
-
-(rule 1 (lower (floor a @ (value_type $F64)))
-        (if-let $true (use_sse41))
-        (x64_roundsd a (RoundImm.RoundDown)))
-
-(rule 1 (lower (floor a @ (value_type $F32X4)))
-        (if-let $true (use_sse41))
-        (x64_roundps a (RoundImm.RoundDown)))
-
-(rule 1 (lower (floor a @ (value_type $F64X2)))
-        (if-let $true (use_sse41))
-        (x64_roundpd a (RoundImm.RoundDown)))
-
-(rule (lower (floor a @ (value_type $F32)))
-      (libcall_1 (LibCall.FloorF32) a))
-
-(rule (lower (floor a @ (value_type $F64)))
-      (libcall_1 (LibCall.FloorF64) a))
+(rule (lower (floor a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundDown))) ;; `x64_round` chooses the SSE4.1 instruction or the libcall per type
 
 ;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (nearest a @ (value_type $F32)))
-        (if-let $true (use_sse41))
-        (x64_roundss a (RoundImm.RoundNearest)))
-
-(rule 1 (lower (nearest a @ (value_type $F64)))
-        (if-let $true (use_sse41))
-        (x64_roundsd a (RoundImm.RoundNearest)))
-
-(rule 1 (lower (nearest a @ (value_type $F32X4)))
-        (if-let $true (use_sse41))
-        (x64_roundps a (RoundImm.RoundNearest)))
-
-(rule 1 (lower (nearest a @ (value_type $F64X2)))
-        (if-let $true (use_sse41))
-        (x64_roundpd a (RoundImm.RoundNearest)))
-
-(rule (lower (nearest a @ (value_type $F32)))
-      (libcall_1 (LibCall.NearestF32) a))
-
-(rule (lower (nearest a @ (value_type $F64)))
-      (libcall_1 (LibCall.NearestF64) a))
+(rule (lower (nearest a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundNearest))) ;; `x64_round` chooses the SSE4.1 instruction or the libcall per type
 
 ;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(rule 1 (lower (trunc a @ (value_type $F32)))
-        (if-let $true (use_sse41))
-        (x64_roundss a (RoundImm.RoundZero)))
-
-(rule 1 (lower (trunc a @ (value_type $F64)))
-        (if-let $true (use_sse41))
-        (x64_roundsd a (RoundImm.RoundZero)))
-
-(rule 1 (lower (trunc a @ (value_type $F32X4)))
-        (if-let $true (use_sse41))
-        (x64_roundps a (RoundImm.RoundZero)))
-
-(rule 1 (lower (trunc a @ (value_type $F64X2)))
-        (if-let $true (use_sse41))
-        (x64_roundpd a (RoundImm.RoundZero)))
-
-(rule (lower (trunc a @ (value_type $F32)))
-      (libcall_1 (LibCall.TruncF32) a))
-
-(rule (lower (trunc a @ (value_type $F64)))
-      (libcall_1 (LibCall.TruncF64) a))
+(rule (lower (trunc a @ (value_type ty)))
+      (x64_round ty a (RoundImm.RoundZero))) ;; `x64_round` chooses the SSE4.1 instruction or the libcall per type
 
 ;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
diff --git a/cranelift/filetests/filetests/runtests/ceil.clif b/cranelift/filetests/filetests/runtests/ceil.clif
index cda083705c..8596e3ad91 100644
--- a/cranelift/filetests/filetests/runtests/ceil.clif
+++ b/cranelift/filetests/filetests/runtests/ceil.clif
@@ -3,10 +3,11 @@ test run
 target x86_64
 target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %ceil_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %ceil_is_nan_f64(-sNaN:0x1) == 1
 ; run: %ceil_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %ceil_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %ceil_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+  v1 = ceil v0 ; vector ceil over four f32 lanes; expectations below are per lane
+  return v1
+}
+; run: %ceil_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x1.0 0x1.0 0x1.0p1 0x1.8p1]
+; run: %ceil_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1]
+
+function %ceil_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+  v1 = ceil v0 ; vector ceil over two f64 lanes; expectations below are per lane
+  return v1
+}
+; run: %ceil_f64x2([0x0.5 0x1.0]) == [0x1.0 0x1.0]
+; run: %ceil_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]
diff --git a/cranelift/filetests/filetests/runtests/floor.clif b/cranelift/filetests/filetests/runtests/floor.clif
index 0c8904fe5d..d862b3abc2 100644
--- a/cranelift/filetests/filetests/runtests/floor.clif
+++ b/cranelift/filetests/filetests/runtests/floor.clif
@@ -3,10 +3,11 @@ test run
 target x86_64
 target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %floor_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %floor_is_nan_f64(-sNaN:0x1) == 1
 ; run: %floor_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %floor_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %floor_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+  v1 = floor v0 ; vector floor over four f32 lanes; expectations below are per lane
+  return v1
+}
+; run: %floor_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1]
+; run: %floor_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x1.0 -0x1.0 -0x1.0p1 -0x1.8p1]
+
+function %floor_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+  v1 = floor v0 ; vector floor over two f64 lanes; expectations below are per lane
+  return v1
+}
+; run: %floor_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %floor_f64x2([-0x0.5 -0x1.0]) == [-0x1.0 -0x1.0]
diff --git a/cranelift/filetests/filetests/runtests/nearest.clif b/cranelift/filetests/filetests/runtests/nearest.clif
index 71c2eec0c4..f5ece4a1f2 100644
--- a/cranelift/filetests/filetests/runtests/nearest.clif
+++ b/cranelift/filetests/filetests/runtests/nearest.clif
@@ -3,10 +3,11 @@ test run
 target x86_64
 target x86_64 has_sse41=false
 set enable_simd
-target x86_64 has_avx
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %nearest_f32(f32) -> f32 {
 block0(v0: f32):
@@ -149,3 +150,19 @@ block0(v0: f64):
 ; run: %near_is_nan_f64(-sNaN:0x1) == 1
 ; run: %near_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %near_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %nearest_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+  v1 = nearest v0 ; vector nearest over four f32 lanes; expectations below are per lane
+  return v1
+}
+; run: %nearest_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.8p1]
+; run: %nearest_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.8p1]
+
+function %nearest_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+  v1 = nearest v0 ; vector nearest over two f64 lanes; expectations below are per lane
+  return v1
+}
+; run: %nearest_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %nearest_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]
diff --git a/cranelift/filetests/filetests/runtests/trunc.clif b/cranelift/filetests/filetests/runtests/trunc.clif
index f0b427c917..f4a377b42e 100644
--- a/cranelift/filetests/filetests/runtests/trunc.clif
+++ b/cranelift/filetests/filetests/runtests/trunc.clif
@@ -2,9 +2,12 @@ test interpret
 test run
 target x86_64
 target x86_64 has_sse41=false
+set enable_simd
+target x86_64 sse42 has_avx
 target aarch64
 target s390x
-target riscv64
+;; FIXME: needs support for vectors
+;;target riscv64
 
 function %trunc_f32(f32) -> f32 {
 block0(v0: f32):
@@ -147,3 +150,19 @@ block0(v0: f64):
 ; run: %trunc_is_nan_f64(-sNaN:0x1) == 1
 ; run: %trunc_is_nan_f64(+sNaN:0x4000000000001) == 1
 ; run: %trunc_is_nan_f64(-sNaN:0x4000000000001) == 1
+
+function %trunc_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+  v1 = trunc v0 ; vector trunc over four f32 lanes; expectations below are per lane
+  return v1
+}
+; run: %trunc_f32x4([0x0.5 0x1.0 0x1.5 0x2.9]) == [0x0.0 0x1.0 0x1.0 0x1.0p1]
+; run: %trunc_f32x4([-0x0.5 -0x1.0 -0x1.5 -0x2.9]) == [-0x0.0 -0x1.0 -0x1.0 -0x1.0p1]
+
+function %trunc_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+  v1 = trunc v0 ; vector trunc over two f64 lanes; expectations below are per lane
+  return v1
+}
+; run: %trunc_f64x2([0x0.5 0x1.0]) == [0x0.0 0x1.0]
+; run: %trunc_f64x2([-0x0.5 -0x1.0]) == [-0x0.0 -0x1.0]