Cranelift AArch64: Improve code generation for vector constants

In particular, introduce initial support for the MOVI and MVNI instructions, with 8-bit elements. Also, treat vector constants as 32- or 64-bit floating-point numbers, if their value allows it, by relying on the architectural zero extension. Finally, stop generating literal loads for 32-bit constants. Copyright (c) 2020, Arm Limited.
2020-10-14 13:04:08 +01:00
parent 7b43bf76ed
commit 207779fe1d
12 changed files with 549 additions and 164 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -813,7 +813,11 @@ pub(crate) fn lower_constant_f32<C: LowerCtx<I = Inst>>(
    rd: Writable<Reg>,
    value: f32,
 ) {
-    ctx.emit(Inst::load_fp_constant32(rd, value));
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+    for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) {
+        ctx.emit(inst);
+    }
 }

 pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
@@ -821,7 +825,11 @@ pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
    rd: Writable<Reg>,
    value: f64,
 ) {
-    ctx.emit(Inst::load_fp_constant64(rd, value));
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+    for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) {
+        ctx.emit(inst);
+    }
 }

 pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
@@ -829,7 +837,38 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
    rd: Writable<Reg>,
    value: u128,
 ) {
-    ctx.emit(Inst::load_fp_constant128(rd, value));
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+
+    for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
+        ctx.emit(inst);
+    }
+}
+
+pub(crate) fn lower_splat_const<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    rd: Writable<Reg>,
+    value: u64,
+    size: VectorSize,
+) {  // Emits the instructions from `Inst::load_replicated_vector_pattern` for `rd`.
+    let (value, narrow_size) = match size.lane_size() {  // truncate `value` to the lane width
+        ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128),  // NOTE(review): presumably "no narrower lane" - confirm
+        ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8),
+        ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16),
+        ScalarSize::Size64 => (value, ScalarSize::Size32),
+        _ => unreachable!(),  // any other lane size panics here
+    };
+    let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) {
+        Some((value, lane_size)) => (  // a pattern was found: switch to its value and lane size
+            value,
+            VectorSize::from_lane_size(lane_size, size.is_128bits()),  // keeps the original 128-bit flag
+        ),
+        None => (value, size),  // no pattern: keep the truncated value and the caller's size
+    };
+    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);  // forwards to `ctx.alloc_tmp` for the helper below
+
+    for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) {
+        ctx.emit(inst);  // emit every returned instruction, in order
+    }
 }

 pub(crate) fn lower_condcode(cc: IntCC) -> Cond {