From db7f9ccd2b1d606e608e2cbc989e5856736f1c9c Mon Sep 17 00:00:00 2001
From: Damian Heaton <87125748+dheaton-arm@users.noreply.github.com>
Date: Mon, 18 Jul 2022 19:11:54 +0100
Subject: [PATCH] Convert `scalar_to_vector` to ISLE (AArch64) (#4401)

* Convert `scalar_to_vector` to ISLE (AArch64)

Converted the existing implementation of `scalar_to_vector` for AArch64 to
ISLE.

Copyright (c) 2022 Arm Limited

* Add support for floats and fix FpuExtend

- Added rules to cover `f32 -> f32x4` and `f64 -> f64x2` for
`scalar_to_vector`
- Added tests for `scalar_to_vector` on floats.
- Corrected an invalid instruction emitted by `FpuExtend` on 64-bit
values.

Copyright (c) 2022 Arm Limited
---
 cranelift/codegen/src/isa/aarch64/inst.isle   |  7 ++++
 .../codegen/src/isa/aarch64/inst/emit.rs      |  2 +-
 .../src/isa/aarch64/inst/emit_tests.rs        | 10 +++++
 cranelift/codegen/src/isa/aarch64/lower.isle  | 14 +++++++
 .../codegen/src/isa/aarch64/lower_inst.rs     | 20 +--------
 cranelift/codegen/src/machinst/isle.rs        |  8 ++++
 cranelift/codegen/src/prelude.isle            |  4 ++
 .../filetests/isa/aarch64/simd_load_zero.clif | 34 ++++++++++++---
 .../runtests/simd-scalartovector-aarch64.clif | 19 +++++++++
 .../runtests/simd-scalartovector.clif         | 42 +++++++++++++++++++
 10 files changed, 135 insertions(+), 25 deletions(-)
 create mode 100644 cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-scalartovector.clif

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index 6397ff3c1c..9e211a4c7b 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -1637,6 +1637,13 @@
             (_ Unit (emit (MInst.Extend dst rn signed from_bits to_bits))))
         dst))
 
+;; Helper for emitting `MInst.FpuExtend` instructions.
+(decl fpu_extend (Reg ScalarSize) Reg)
+(rule (fpu_extend src size)
+      (let ((dst WritableReg (temp_writable_reg $F32X4))
+            (_ Unit (emit (MInst.FpuExtend dst src size))))
+        dst))
+
 ;; Helper for emitting `MInst.LoadAcquire` instructions.
 (decl load_acquire (Type Reg) Reg)
 (rule (load_acquire ty addr)
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 7ff0a2f2a2..bfbd121b2d 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1688,7 +1688,7 @@ impl MachInstEmit for Inst {
                 let rd = allocs.next_writable(rd);
                 let rn = allocs.next(rn);
                 sink.put4(enc_fpurr(
-                    0b000_11110_00_1_000000_10000 | (size.ftype() << 13),
+                    0b000_11110_00_1_000000_10000 | (size.ftype() << 12),
                     rd,
                     rn,
                 ));
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index c1a124ec80..66d1d8a776 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -5528,6 +5528,16 @@ fn test_aarch64_binemit() {
         "fmov s31, s0",
     ));
 
+    insns.push((
+        Inst::FpuExtend {
+            rd: writable_vreg(31),
+            rn: vreg(0),
+            size: ScalarSize::Size64,
+        },
+        "1F40601E",
+        "fmov d31, d0",
+    ));
+
     insns.push((
         Inst::FpuRR {
             fpu_op: FPUOp1::Abs,
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index 6d5cc87f92..f215654b1d 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -121,6 +121,20 @@
 (rule (lower (has_type $I128 (iconcat lo hi)))
       (output (value_regs lo hi)))
 
+;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $F32X4 (scalar_to_vector x)))
+      (fpu_extend x (ScalarSize.Size32)))
+
+(rule (lower (has_type $F64X2 (scalar_to_vector x)))
+      (fpu_extend x (ScalarSize.Size64)))
+
+(rule (lower (scalar_to_vector x @ (value_type (ty_int_bool_64 _))))
+      (mov_to_fpu x (ScalarSize.Size64)))
+
+(rule (lower (scalar_to_vector x @ (value_type (int_bool_fits_in_32 _))))
+      (mov_to_fpu (put_in_reg_zext32 x) (ScalarSize.Size32)))
+
 ;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low x) (swiden_high y))))
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index b407ef3dd9..50f01e9f23 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -816,25 +816,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
-        Opcode::ScalarToVector => {
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let input_ty = ctx.input_ty(insn, 0);
-            if (input_ty == I32 && ty.unwrap() == I32X4)
-                || (input_ty == I64 && ty.unwrap() == I64X2)
-            {
-                ctx.emit(Inst::MovToFpu {
-                    rd,
-                    rn,
-                    size: ScalarSize::from_ty(input_ty),
-                });
-            } else {
-                return Err(CodegenError::Unsupported(format!(
-                    "ScalarToVector: unsupported types {:?} -> {:?}",
-                    input_ty, ty
-                )));
-            }
-        }
+        Opcode::ScalarToVector => implemented_in_isle(ctx),
 
         Opcode::VallTrue if ctx.input_ty(insn, 0).lane_bits() == 64 => {
             let input_ty = ctx.input_ty(insn, 0);
diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs
index 28005863df..c941ead143 100644
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -299,6 +299,14 @@ macro_rules! isle_prelude_methods {
             }
         }
 
+        #[inline]
+        fn int_bool_fits_in_32(&mut self, ty: Type) -> Option<Type> {
+            match ty {
+                I8 | I16 | I32 | B8 | B16 | B32 => Some(ty),
+                _ => None,
+            }
+        }
+
         #[inline]
         fn ty_int_bool_64(&mut self, ty: Type) -> Option<Type> {
             match ty {
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index ccaef32341..62933cd7a1 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -313,6 +313,10 @@
 (decl ty_8_or_16 (Type) Type)
 (extern extractor ty_8_or_16 ty_8_or_16)
 
+;; An extractor that matches int and bool types that fit in 32 bits.
+(decl int_bool_fits_in_32 (Type) Type)
+(extern extractor int_bool_fits_in_32 int_bool_fits_in_32)
+
 ;; An extractor that matches I64 or B64.
 (decl ty_int_bool_64 (Type) Type)
 (extern extractor ty_int_bool_64 ty_int_bool_64)
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif b/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
index 894ed03775..70ceecd6db 100644
--- a/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
@@ -10,9 +10,9 @@ block0:
 }
 
 ; block0:
-;   movz x2, #1
-;   movk x2, #1, LSL #48
-;   fmov d0, x2
+;   movz x1, #1
+;   movk x1, #1, LSL #48
+;   fmov d0, x1
 ;   ret
 
 function %f2() -> i32x4 {
@@ -23,7 +23,31 @@ block0:
 }
 
 ; block0:
-;   movz x2, #42679
-;   fmov s0, w2
+;   movz x1, #42679
+;   fmov s0, w1
+;   ret
+
+function %f3() -> f32x4 {
+block0:
+  v0 = f32const 0x1.0
+  v1 = scalar_to_vector.f32x4 v0
+  return v1
+}
+
+; block0:
+;   fmov s1, #1
+;   fmov s0, s1
+;   ret
+
+function %f4() -> f64x2 {
+block0:
+  v0 = f64const 0x1.0
+  v1 = scalar_to_vector.f64x2 v0
+  return v1
+}
+
+; block0:
+;   fmov d1, #1
+;   fmov d0, d1
 ;   ret
 
diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif
new file mode 100644
index 0000000000..5a458f7de6
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-scalartovector-aarch64.clif
@@ -0,0 +1,19 @@
+test run
+target aarch64
+; i8 and i16 are invalid source sizes for x86_64
+
+function %scalartovector_i8(i8) -> i8x16 {
+block0(v0: i8):
+    v1 = scalar_to_vector.i8x16 v0
+    return v1
+}
+; run: %scalartovector_i8(1) == [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+; run: %scalartovector_i8(255) == [255 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+
+function %scalartovector_i16(i16) -> i16x8 {
+block0(v0: i16):
+    v1 = scalar_to_vector.i16x8 v0
+    return v1
+}
+; run: %scalartovector_i16(1) == [1 0 0 0 0 0 0 0]
+; run: %scalartovector_i16(65535) == [65535 0 0 0 0 0 0 0]
\ No newline at end of file
diff --git a/cranelift/filetests/filetests/runtests/simd-scalartovector.clif b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif
new file mode 100644
index 0000000000..37d726f8bf
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-scalartovector.clif
@@ -0,0 +1,42 @@
+test run
+target aarch64
+set enable_simd
+target x86_64 has_sse3 has_ssse3 has_sse41
+
+function %scalartovector_i32(i32) -> i32x4 {
+block0(v0: i32):
+    v1 = scalar_to_vector.i32x4 v0
+    return v1
+}
+; run: %scalartovector_i32(1) == [1 0 0 0]
+; run: %scalartovector_i32(4294967295) == [4294967295 0 0 0]
+
+function %scalartovector_i64(i64) -> i64x2 {
+block0(v0: i64):
+    v1 = scalar_to_vector.i64x2 v0
+    return v1
+}
+; run: %scalartovector_i64(1) == [1 0]
+; run: %scalartovector_i64(18446744073709551615) == [18446744073709551615 0]
+
+function %scalartovector_f32(f32) -> f32x4 {
+block0(v0: f32):
+    v1 = scalar_to_vector.f32x4 v0
+    return v1
+}
+; run: %scalartovector_f32(0x1.0) == [0x1.0 0x0.0 0x0.0 0x0.0]
+; run: %scalartovector_f32(0x0.1) == [0x0.1 0x0.0 0x0.0 0x0.0]
+; run: %scalartovector_f32(NaN) == [NaN 0x0.0 0x0.0 0x0.0]
+; run: %scalartovector_f32(-0x0.0) == [-0x0.0 0x0.0 0x0.0 0x0.0]
+; run: %scalartovector_f32(0x0.0) == [0x0.0 0x0.0 0x0.0 0x0.0]
+
+function %scalartovector_f64(f64) -> f64x2 {
+block0(v0: f64):
+    v1 = scalar_to_vector.f64x2 v0
+    return v1
+}
+; run: %scalartovector_f64(0x1.0) == [0x1.0 0x0.0]
+; run: %scalartovector_f64(0x0.1) == [0x0.1 0x0.0]
+; run: %scalartovector_f64(NaN) == [NaN 0x0.0]
+; run: %scalartovector_f64(-0x0.0) == [-0x0.0 0x0.0]
+; run: %scalartovector_f64(0x0.0) == [0x0.0 0x0.0]