x64: Port fdemote and fvdemote to ISLE (#4449)

https://github.com/bytecodealliance/wasmtime/pull/4449
2022-07-18 14:26:23 -07:00
parent 638dc4e0b3
commit b519c975cb
4 changed files with 115 additions and 37 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -2841,6 +2841,13 @@
            (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtss2sd) x dst))))
        dst))

+;; Helper for creating `cvtsd2ss` instructions. Takes an `Xmm` (not `XmmMem`) source and returns the temporary it writes.
+(decl x64_cvtsd2ss (Xmm) Xmm)
+(rule (x64_cvtsd2ss x)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtsd2ss) x dst))))
+        dst))
+
 ;; Helper for creating `cvtps2pd` instructions.
 (decl x64_cvtps2pd (Xmm) Xmm)
 (rule (x64_cvtps2pd x)
@@ -2848,6 +2855,13 @@
           (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtps2pd) x dst))))
        dst))

+;; Helper for creating `cvtpd2ps` instructions. Takes an `Xmm` (not `XmmMem`) source and returns the temporary it writes.
+(decl x64_cvtpd2ps (Xmm) Xmm)
+(rule (x64_cvtpd2ps x)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.XmmUnaryRmR (SseOpcode.Cvtpd2ps) x dst))))
+        dst))
+
 ;; Helpers for creating `pcmpeq*` instructions.
 (decl x64_pcmpeq (Type Xmm XmmMem) Xmm)
 (rule (x64_pcmpeq $I8X16 x y) (x64_pcmpeqb x y))
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2351,6 +2351,14 @@
 (rule (lower (has_type $F64X2 (fvpromote_low x)))
      (x64_cvtps2pd (put_in_xmm x)))

+;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(rule (lower (has_type $F32 (fdemote src)))
+      (x64_cvtsd2ss (put_in_xmm src)))
+
+;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(rule (lower (has_type $F32X4 (fvdemote src)))
+      (x64_cvtpd2ps (put_in_xmm src)))
+
 ;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower (has_type $F32 (fmin x y)))
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -893,13 +893,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        | Opcode::Fmin
        | Opcode::Fmax
        | Opcode::FminPseudo
-        | Opcode::FmaxPseudo => implemented_in_isle(ctx),
-
-        Opcode::Icmp => {
-            implemented_in_isle(ctx);
-        }
-
-        Opcode::Fcmp => {
+        | Opcode::FmaxPseudo
+        | Opcode::Sqrt
+        | Opcode::Fpromote
+        | Opcode::FvpromoteLow
+        | Opcode::Fdemote
+        | Opcode::Fvdemote
+        | Opcode::Icmp
+        | Opcode::Fcmp => {
            implemented_in_isle(ctx);
        }

@@ -1020,36 +1021,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            };
        }

-        Opcode::Sqrt => {
-            implemented_in_isle(ctx);
-        }
-
-        Opcode::Fpromote => {
-            implemented_in_isle(ctx);
-        }
-
-        Opcode::FvpromoteLow => {
-            implemented_in_isle(ctx);
-        }
-
-        Opcode::Fdemote => {
-            // We can't guarantee the RHS (if a load) is 128-bit aligned, so we
-            // must avoid merging a load here.
-            let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
-        }
-
-        Opcode::Fvdemote => {
-            let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            ctx.emit(Inst::xmm_unary_rm_r(
-                SseOpcode::Cvtpd2ps,
-                RegMem::from(src),
-                dst,
-            ));
-        }
-
        Opcode::FcvtFromSint => {
            let output_ty = ty.unwrap();
            if !output_ty.is_vector() {
--- a/cranelift/filetests/filetests/runtests/conversions.clif
+++ b/cranelift/filetests/filetests/runtests/conversions.clif
@@ -0,0 +1,85 @@
+test run
+
+target x86_64
+target aarch64
+
+function %fpromote_f32_f64(i64 vmctx, i64, f32) -> f64 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap aligned gv0+0 ; heap base pointer, read from offset 0 of vmctx
+    heap0 = static gv1, min 0x10, bound 0x10, offset_guard 0x0, index_type i64
+
+block0(v0: i64, v1: i64, v2: f32):
+    v3 = heap_addr.i64 heap0, v1, 4 ; address for a 4-byte access at heap index v1
+    store.f32 v2, v3
+    v4 = load.f32 v3 ; conversion input comes from a load (presumably to exercise load handling at the run-line offsets - confirm)
+    v5 = fpromote.f64 v4 ; f32 -> f64; result is returned unchanged
+    return v5
+}
+
+; heap: static, size=0x10, ptr=vmctx+0, bound=vmctx+8
+; run: %fpromote_f32_f64(0, 0x0.0) == 0x0.0
+; run: %fpromote_f32_f64(1, 0x0.1) == 0x0.1
+; run: %fpromote_f32_f64(2, 0x0.2) == 0x0.2
+; run: %fpromote_f32_f64(3, 0x3.2) == 0x3.2
+; run: %fpromote_f32_f64(0xc, 0x3.2) == 0x3.2
+
+function %fdemote_test(i64 vmctx, i64, f64) -> f32 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap aligned gv0+0 ; heap base pointer, read from offset 0 of vmctx
+    heap0 = static gv1, min 0x10, bound 0x10, offset_guard 0x0, index_type i64
+
+block0(v0: i64, v1: i64, v2: f64):
+    v3 = heap_addr.i64 heap0, v1, 8 ; address for an 8-byte access at heap index v1
+    store.f64 v2, v3
+    v4 = load.f64 v3 ; conversion input comes from a load (presumably to exercise load handling at the run-line offsets - confirm)
+    v5 = fdemote.f32 v4 ; f64 -> f32; result is returned unchanged
+    return v5
+}
+
+; heap: static, size=0x10, ptr=vmctx+0, bound=vmctx+8
+; run: %fdemote_test(0, 0x0.0) == 0x0.0
+; run: %fdemote_test(1, 0x0.1) == 0x0.1
+; run: %fdemote_test(2, 0x0.2) == 0x0.2
+; run: %fdemote_test(3, 0x3.2) == 0x3.2
+; run: %fdemote_test(0x8, 0x3.2) == 0x3.2
+
+function %fvdemote_test(i64 vmctx, i64, f64x2) -> f32x4 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap aligned gv0+0 ; heap base pointer, read from offset 0 of vmctx
+    heap0 = static gv1, min 0x20, bound 0x20, offset_guard 0, index_type i64
+
+block0(v0: i64, v1: i64, v2: f64x2):
+    v3 = heap_addr.i64 heap0, v1, 16 ; address for a 16-byte access at heap index v1
+    store.f64x2 v2, v3
+    v4 = load.f64x2 v3 ; conversion input comes from a load (presumably to exercise load handling at the run-line offsets - confirm)
+    v5 = fvdemote v4 ; f64x2 -> f32x4; the run lines expect the two upper lanes to be zero
+    return v5
+}
+
+; heap: static, size=0x20, ptr=vmctx+0, bound=vmctx+8
+; run: %fvdemote_test(0, [0x0.0 0x0.0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
+; run: %fvdemote_test(1, [0x0.1 0x0.2]) == [0x0.1 0x0.2 0x0.0 0x0.0]
+; run: %fvdemote_test(2, [0x2.1 0x1.2]) == [0x2.1 0x1.2 0x0.0 0x0.0]
+; run: %fvdemote_test(8, [0x2.1 0x1.2]) == [0x2.1 0x1.2 0x0.0 0x0.0]
+; run: %fvdemote_test(16, [0x2.1 0x1.2]) == [0x2.1 0x1.2 0x0.0 0x0.0]
+
+
+function %fvpromote_low_test(i64 vmctx, i64, f32x4) -> f64x2 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap aligned gv0+0 ; heap base pointer, read from offset 0 of vmctx
+    heap0 = static gv1, min 0x20, bound 0x20, offset_guard 0, index_type i64
+
+block0(v0: i64, v1: i64, v2: f32x4):
+    v3 = heap_addr.i64 heap0, v1, 16 ; address for a 16-byte access at heap index v1
+    store.f32x4 v2, v3
+    v4 = load.f32x4 v3 ; conversion input comes from a load (presumably to exercise load handling at the run-line offsets - confirm)
+    v5 = fvpromote_low v4 ; f32x4 -> f64x2; the run lines expect only the two low lanes to be converted
+    return v5
+}
+
+; heap: static, size=0x20, ptr=vmctx+0, bound=vmctx+8
+; run: %fvpromote_low_test(0, [0x0.0 0x0.0 0x0.0 0x0.0]) == [0x0.0 0x0.0]
+; run: %fvpromote_low_test(1, [0x0.1 0x0.2 0x0.0 0x0.0]) == [0x0.1 0x0.2]
+; run: %fvpromote_low_test(2, [0x2.1 0x1.2 0x0.0 0x0.0]) == [0x2.1 0x1.2]
+; run: %fvpromote_low_test(5, [0x0.0 0x0.0 0x2.1 0x1.2]) == [0x0.0 0x0.0]
+; run: %fvpromote_low_test(16, [0x0.0 0x0.0 0x2.1 0x1.2]) == [0x0.0 0x0.0]