From bb2dd5b68b367df3305a98914eb9baeafa80bc03 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Mon, 7 Dec 2020 13:59:31 -0800
Subject: [PATCH] [machinst x64]: implement load*_zero for x64

---
 cranelift/codegen/src/isa/x64/lower.rs        | 47 +++++++++++++++++++
 .../isa/x64/simd-lane-access-compile.clif     | 31 ++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 6b779a81d8..39d6f9fb57 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -4095,6 +4095,53 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             emit_extract_lane(ctx, src, dst, lane, ty);
         }
 
+        Opcode::ScalarToVector => {
+            // When moving a scalar value to a vector register, we must handle several
+            // situations:
+            // 1. a scalar float is already in an XMM register, so we simply move it
+            // 2. a scalar of any other type resides in a GPR: MOVD moves the bits to an
+            //    XMM register and zeroes the upper bits
+            // 3. a scalar (float or otherwise) that has previously been loaded from memory (e.g.
+            //    the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a single
+            //    MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to sink the
+            //    unused load.
+            let src = input_to_reg_mem(ctx, inputs[0]);
+            let src_ty = ctx.input_ty(insn, 0);
+            let dst = get_output_reg(ctx, outputs[0]);
+            let dst_ty = ty.unwrap();
+            assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
+            match src {
+                RegMem::Reg { reg } => {
+                    if src_ty.is_float() {
+                        // Case 1: when moving a scalar float, we simply move from one XMM register
+                        // to another, expecting the register allocator to elide this. Here we
+                        // assume that the upper bits of a scalar float have not been munged
+                        // (the same assumption the old backend makes).
+                        ctx.emit(Inst::gen_move(dst, reg, dst_ty));
+                    } else {
+                        // Case 2: when moving a scalar value of any other type, use MOVD to zero
+                        // the upper lanes.
+                        let src_size = match src_ty.bits() {
+                            32 => OperandSize::Size32,
+                            64 => OperandSize::Size64,
+                            _ => unimplemented!("invalid source size for type: {}", src_ty),
+                        };
+                        ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
+                    }
+                }
+                RegMem::Mem { .. } => {
+                    // Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
+                    // MOVSS/MOVSD instruction.
+                    let opcode = match src_ty.bits() {
+                        32 => SseOpcode::Movss,
+                        64 => SseOpcode::Movsd,
+                        _ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
+                    };
+                    ctx.emit(Inst::xmm_mov(opcode, src, dst));
+                }
+            }
+        }
+
         Opcode::Splat => {
             let ty = ty.unwrap();
             assert_eq!(ty.bits(), 128);
diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
index f44dbd3b62..f451bd2a25 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
@@ -91,3 +91,34 @@ block0(v0: f64):
 ; check: uninit %xmm1
 ; nextln: movsd %xmm0, %xmm1
 ; nextln: movlhps %xmm0, %xmm1
+
+
+
+;; load*_zero
+
+; Verify that a `load` followed by a `scalar_to_vector` (the CLIF translation of `load32_zero`) is
+; lowered to a single MOVSS instruction.
+function %load32_zero_coalesced(i64) -> i32x4 {
+block0(v0: i64):
+    v1 = load.i32 v0
+    v2 = scalar_to_vector.i32x4 v1
+    ; check: movss 0(%rdi), %xmm0
+    return v2
+}
+
+;; Verify that `scalar_to_vector` (used by `load32_zero`) lowers as expected when its input is already in a register.
+function %load32_zero_int(i32) -> i32x4 {
+block0(v0: i32):
+    v1 = scalar_to_vector.i32x4 v0
+    ; check: movd %edi, %xmm0
+    return v1
+}
+function %load32_zero_float(f32) -> f32x4 {
+block0(v0: f32):
+    v1 = scalar_to_vector.f32x4 v0
+    ; regex: MOV=movap*
+    ; check: pushq
+    ; not: $MOV
+    ; check: ret
+    return v1
+}
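
Note: the filetests above only exercise the MOVSS (32-bit) side of the load coalescing described in case 3. A minimal sketch of an analogous 64-bit test that could be appended to simd-lane-access-compile.clif is shown below; the function name %load64_zero_coalesced and the expected MOVSD disassembly are assumptions derived from the lowering above, not checks taken from this patch.

; Sketch only: expected output assumed by analogy with %load32_zero_coalesced above.
function %load64_zero_coalesced(i64) -> i64x2 {
block0(v0: i64):
    v1 = load.i64 v0
    v2 = scalar_to_vector.i64x2 v1
    ; check: movsd 0(%rdi), %xmm0
    return v2
}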