x64: Lower vany_true, vall_true, vhigh_bits, iconcat, and isplit in ISLE (#4787)

Lower vany_true, vall_true, vhigh_bits, iconcat, and isplit in ISLE.
2022-08-26 09:07:22 -07:00
parent 05ffdc26ec
commit c1f9736938
10 changed files with 210 additions and 193 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -1521,6 +1521,13 @@

 ;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+;; Turn a vector type into its integer-typed vector equivalent.
+(decl vec_int_type (Type) Type)
+(rule (vec_int_type (multi_lane 8 16)) $I8X16)
+(rule (vec_int_type (multi_lane 16 8)) $I16X8)
+(rule (vec_int_type (multi_lane 32 4)) $I32X4)
+(rule (vec_int_type (multi_lane 64 2)) $I64X2)
+
 ;; Determine the appropriate operation for xor-ing vectors of the specified type
 (decl sse_xor_op (Type) SseOpcode)
 (rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
@@ -2021,6 +2028,11 @@
 (rule (x64_test size src1 src2)
      (cmp_rmi_r size (CmpOpcode.Test) src1 src2))

+;; Helper for creating `ptest` instructions.
+(decl x64_ptest (XmmMem Xmm) ProducesFlags)
+(rule (x64_ptest src1 src2)
+      (xmm_cmp_rm_r (SseOpcode.Ptest) src1 src2))
+
 ;; Helper for creating `cmove` instructions. Note that these instructions do not
 ;; always result in a single emitted x86 instruction; e.g., XmmCmove uses jumps
 ;; to conditionally move the selected value into an XMM register.
@@ -2889,6 +2901,21 @@
            (_ Unit (emit (MInst.XmmToGpr op src dst size))))
        dst))

+;; Helper for creating `pmovmskb` instructions.
+(decl x64_pmovmskb (OperandSize Xmm) Gpr)
+(rule (x64_pmovmskb size src)
+      (xmm_to_gpr (SseOpcode.Pmovmskb) src size))
+
+;; Helper for creating `movmskps` instructions.
+(decl x64_movmskps (OperandSize Xmm) Gpr)
+(rule (x64_movmskps size src)
+      (xmm_to_gpr (SseOpcode.Movmskps) src size))
+
+;; Helper for creating `movmskpd` instructions.
+(decl x64_movmskpd (OperandSize Xmm) Gpr)
+(rule (x64_movmskpd size src)
+      (xmm_to_gpr (SseOpcode.Movmskpd) src size))
+
 ;; Helper for creating `MInst.GprToXmm` instructions.
 (decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
 (rule (gpr_to_xmm op src size)
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -89,6 +89,12 @@ impl Inst {
            dst: WritableXmm::from_writable_reg(dst).unwrap(),
        }
    }
+
+    fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
+        debug_assert!(dst.to_reg().class() == RegClass::Int);
+        let dst = WritableGpr::from_writable_reg(dst).unwrap();
+        Inst::Setcc { cc, dst }
+    }
 }

 #[test]
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -478,12 +478,6 @@ impl Inst {
        Inst::Ud2 { trap_code }
    }

-    pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
-        debug_assert!(dst.to_reg().class() == RegClass::Int);
-        let dst = WritableGpr::from_writable_reg(dst).unwrap();
-        Inst::Setcc { cc, dst }
-    }
-
    pub(crate) fn cmove(size: OperandSize, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
        debug_assert!(size.is_one_of(&[
            OperandSize::Size16,
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3643,3 +3643,61 @@
            (src RegMem (RegMem.Reg src))
            (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
        (vec_insert_lane ty vec src 1)))
+
+;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (vany_true val))
+      (with_flags (x64_ptest val val) (x64_setcc (CC.NZ))))
+
+;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (vall_true val @ (value_type ty)))
+      (let ((src Xmm val)
+            (zeros Xmm (x64_pxor src src))
+            (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
+        (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
+
+;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The Intel specification allows using both 32-bit and 64-bit GPRs as the
+;; destination for the "move mask" instructions. This is controlled by the REX.R
+;; bit: "In 64-bit mode, the instruction can access additional registers when
+;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode"
+;; (PMOVMSKB in IA Software Development Manual, vol. 2). Since `vhigh_bits`
+;; needs at most 16 bits of output, REX.W is unnecessary, so we always clear it
+;; by passing `OperandSize.Size32` (`OperandSize` is what sets or clears
+;; REX.W).
+
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 8 16))))
+      (x64_pmovmskb (OperandSize.Size32) val))
+
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 32 4))))
+      (x64_movmskps (OperandSize.Size32) val))
+
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 64 2))))
+      (x64_movmskpd (OperandSize.Size32) val))
+
+;; There is no x86 instruction for extracting the high bit of 16-bit lanes so
+;; here we:
+;; - duplicate the 16-bit lanes of `src` into 8-bit lanes:
+;;     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
+;; - use PMOVMSKB to gather the high bits; now we have duplicates, though
+;; - shift right by 8 to drop the low 8 mask bits, which duplicate the high 8.
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 16 8))))
+      (let ((src Xmm val)
+            (tmp Xmm (x64_packsswb src src))
+            (tmp Gpr (x64_pmovmskb (OperandSize.Size32) tmp)))
+        (x64_shr $I64 tmp (Imm8Reg.Imm8 8))))
+
+;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (iconcat lo @ (value_type $I64) hi))
+      (value_regs lo hi))
+
+;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (isplit val @ (value_type $I128)))
+      (let ((regs ValueRegs val)
+            (lo Reg (value_regs_get regs 0))
+            (hi Reg (value_regs_get regs 1)))
+        (output_pair lo hi)))
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -129,32 +129,6 @@ fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInp
    }
 }

-/// Put the given input into a register or a memory operand.
-/// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem(ctx: &mut Lower<Inst>, spec: InsnInput) -> RegMem {
-    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
-
-    if let Some(c) = inputs.constant {
-        // Generate constants fresh at each use to minimize long-range register pressure.
-        let ty = ctx.input_ty(spec.insn, spec.input);
-        return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
-    }
-
-    if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
-        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
-            ctx.sink_inst(src_insn);
-            let amode = lower_to_amode(ctx, addr_input, offset);
-            return RegMem::mem(amode);
-        }
-    }
-
-    RegMem::reg(
-        ctx.put_input_in_regs(spec.insn, spec.input)
-            .only_reg()
-            .unwrap(),
-    )
-}
-
 fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
    ctx.get_input_as_source_or_const(spec.insn, spec.input)
        .constant
@@ -495,136 +469,17 @@ fn lower_insn_to_regs(
        | Opcode::Swizzle
        | Opcode::Extractlane
        | Opcode::ScalarToVector
-        | Opcode::Splat => {
+        | Opcode::Splat
+        | Opcode::VanyTrue
+        | Opcode::VallTrue
+        | Opcode::VhighBits
+        | Opcode::Iconcat
+        | Opcode::Isplit => {
            implemented_in_isle(ctx);
        }

        Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),

-        Opcode::VanyTrue => {
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src_ty = ctx.input_ty(insn, 0);
-            assert_eq!(src_ty.bits(), 128);
-            let src = put_input_in_reg(ctx, inputs[0]);
-            // Set the ZF if the result is all zeroes.
-            ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
-            // If the ZF is not set, place a 1 in `dst`.
-            ctx.emit(Inst::setcc(CC::NZ, dst));
-        }
-
-        Opcode::VallTrue => {
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src_ty = ctx.input_ty(insn, 0);
-            assert_eq!(src_ty.bits(), 128);
-            let src = input_to_reg_mem(ctx, inputs[0]);
-
-            let eq = |ty: Type| match ty.lane_bits() {
-                8 => SseOpcode::Pcmpeqb,
-                16 => SseOpcode::Pcmpeqw,
-                32 => SseOpcode::Pcmpeqd,
-                64 => SseOpcode::Pcmpeqq,
-                _ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
-            };
-
-            // Initialize a register with all 0s.
-            let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
-            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
-            // Compare to see what lanes are filled with all 1s.
-            ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
-            // Set the ZF if the result is all zeroes.
-            ctx.emit(Inst::xmm_cmp_rm_r(
-                SseOpcode::Ptest,
-                RegMem::from(tmp),
-                tmp.to_reg(),
-            ));
-            // If the ZF is set, place a 1 in `dst`.
-            ctx.emit(Inst::setcc(CC::Z, dst));
-        }
-
-        Opcode::VhighBits => {
-            let src = put_input_in_reg(ctx, inputs[0]);
-            let src_ty = ctx.input_ty(insn, 0);
-            debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            debug_assert!(dst.to_reg().class() == RegClass::Int);
-
-            // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
-            // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
-            // the instruction can access additional registers when used with a REX.R prefix. The
-            // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
-            // Manual, vol. 2). This being the case, we will always clear REX.W since its use is
-            // unnecessary (`OperandSize` is used for setting/clearing REX.W).
-            let size = OperandSize::Size32;
-
-            match src_ty {
-                types::I8X16 | types::B8X16 => {
-                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
-                }
-                types::I32X4 | types::B32X4 | types::F32X4 => {
-                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
-                }
-                types::I64X2 | types::B64X2 | types::F64X2 => {
-                    ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
-                }
-                types::I16X8 | types::B16X8 => {
-                    // There is no x86 instruction for extracting the high bit of 16-bit lanes so
-                    // here we:
-                    // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
-                    //     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
-                    // - use PMOVMSKB to gather the high bits; now we have duplicates, though
-                    // - shift away the bottom 8 high bits to remove the duplicates.
-                    let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
-                    ctx.emit(Inst::gen_move(tmp, src, src_ty));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
-                    ctx.emit(Inst::xmm_to_gpr(
-                        SseOpcode::Pmovmskb,
-                        tmp.to_reg(),
-                        dst,
-                        size,
-                    ));
-                    ctx.emit(Inst::shift_r(
-                        OperandSize::Size64,
-                        ShiftKind::ShiftRightLogical,
-                        Some(8),
-                        dst,
-                    ));
-                }
-                _ => unimplemented!("unknown input type {} for {}", src_ty, op),
-            }
-        }
-
-        Opcode::Iconcat => {
-            let ty = ctx.output_ty(insn, 0);
-            assert_eq!(
-                ty,
-                types::I128,
-                "Iconcat not expected to be used for non-128-bit type"
-            );
-            assert_eq!(ctx.input_ty(insn, 0), types::I64);
-            assert_eq!(ctx.input_ty(insn, 1), types::I64);
-            let lo = put_input_in_reg(ctx, inputs[0]);
-            let hi = put_input_in_reg(ctx, inputs[1]);
-            let dst = get_output_reg(ctx, outputs[0]);
-            ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
-            ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
-        }
-
-        Opcode::Isplit => {
-            let ty = ctx.input_ty(insn, 0);
-            assert_eq!(
-                ty,
-                types::I128,
-                "Isplit not expected to be used for non-128-bit type"
-            );
-            assert_eq!(ctx.output_ty(insn, 0), types::I64);
-            assert_eq!(ctx.output_ty(insn, 1), types::I64);
-            let src = put_input_in_regs(ctx, inputs[0]);
-            let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
-            ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
-            ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
-        }
-
        Opcode::TlsValue => {
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let (name, _, _) = ctx.symbol_value(insn).unwrap();
--- a/cranelift/filetests/filetests/isa/x64/ishl.clif
+++ b/cranelift/filetests/filetests/isa/x64/ishl.clif
@@ -17,20 +17,20 @@ block0(v0: i128, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movzbq  %dl, %rax
-;   movq    %rax, %rcx
+;   movzbq  %dl, %rcx
 ;   movq    %rdi, %rdx
 ;   shlq    %cl, %rdx, %rdx
 ;   shlq    %cl, %rsi, %rsi
-;   movq    %rcx, %r8
+;   movq    %rcx, %rax
 ;   movl    $64, %ecx
-;   subq    %rcx, %r8, %rcx
+;   movq    %rax, %r10
+;   subq    %rcx, %r10, %rcx
 ;   shrq    %cl, %rdi, %rdi
 ;   xorq    %rax, %rax, %rax
-;   testq   $127, %r8
+;   testq   $127, %r10
 ;   cmovzq  %rax, %rdi, %rdi
 ;   orq     %rdi, %rsi, %rdi
-;   testq   $64, %r8
+;   testq   $64, %r10
 ;   cmovzq  %rdx, %rax, %rax
 ;   cmovzq  %rdi, %rdx, %rdx
 ;   movq    %rbp, %rsp
--- a/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif
@@ -41,9 +41,9 @@ block0(v0: i64x2):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   pxor    %xmm4, %xmm4, %xmm4
-;   pcmpeqq %xmm4, %xmm0, %xmm4
-;   ptest   %xmm4, %xmm4
+;   pxor    %xmm3, %xmm3, %xmm3
+;   pcmpeqq %xmm0, %xmm3, %xmm0
+;   ptest   %xmm0, %xmm0
 ;   setz    %al
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
--- a/cranelift/filetests/filetests/isa/x64/sshr.clif
+++ b/cranelift/filetests/filetests/isa/x64/sshr.clif
@@ -16,24 +16,25 @@ block0(v0: i128, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movzbq  %dl, %rdx
-;   movq    %rdx, %rcx
+;   movzbq  %dl, %rcx
 ;   shrq    %cl, %rdi, %rdi
-;   movq    %rsi, %r9
-;   sarq    %cl, %r9, %r9
+;   movq    %rsi, %rdx
+;   sarq    %cl, %rdx, %rdx
+;   movq    %rcx, %rax
 ;   movl    $64, %ecx
-;   subq    %rcx, %rdx, %rcx
-;   movq    %rsi, %r8
-;   shlq    %cl, %r8, %r8
-;   xorq    %r10, %r10, %r10
-;   testq   $127, %rdx
-;   cmovzq  %r10, %r8, %r8
-;   orq     %rdi, %r8, %rdi
+;   movq    %rax, %r11
+;   subq    %rcx, %r11, %rcx
+;   movq    %rsi, %rax
+;   shlq    %cl, %rax, %rax
+;   xorq    %r8, %r8, %r8
+;   testq   $127, %r11
+;   cmovzq  %r8, %rax, %rax
+;   orq     %rdi, %rax, %rdi
 ;   sarq    $63, %rsi, %rsi
-;   testq   $64, %rdx
-;   movq    %r9, %rax
+;   testq   $64, %r11
+;   movq    %rdx, %rax
 ;   cmovzq  %rdi, %rax, %rax
-;   cmovzq  %r9, %rsi, %rsi
+;   cmovzq  %rdx, %rsi, %rsi
 ;   movq    %rsi, %rdx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
--- a/cranelift/filetests/filetests/isa/x64/ushr.clif
+++ b/cranelift/filetests/filetests/isa/x64/ushr.clif
@@ -15,24 +15,24 @@ block0(v0: i128, v1: i8):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movzbq  %dl, %rdx
-;   movq    %rdx, %rcx
+;   movzbq  %dl, %rcx
 ;   shrq    %cl, %rdi, %rdi
-;   movq    %rsi, %r9
-;   shrq    %cl, %r9, %r9
+;   movq    %rsi, %r8
+;   shrq    %cl, %r8, %r8
+;   movq    %rcx, %rax
 ;   movl    $64, %ecx
-;   movq    %rdx, %r10
-;   subq    %rcx, %r10, %rcx
+;   movq    %rax, %r11
+;   subq    %rcx, %r11, %rcx
 ;   shlq    %cl, %rsi, %rsi
-;   xorq    %r8, %r8, %r8
-;   testq   $127, %r10
-;   cmovzq  %r8, %rsi, %rsi
+;   xorq    %rax, %rax, %rax
+;   testq   $127, %r11
+;   cmovzq  %rax, %rsi, %rsi
 ;   orq     %rsi, %rdi, %rsi
 ;   xorq    %rdx, %rdx, %rdx
-;   testq   $64, %r10
-;   movq    %r9, %rax
+;   testq   $64, %r11
+;   movq    %r8, %rax
 ;   cmovzq  %rsi, %rax, %rax
-;   cmovzq  %r9, %rdx, %rdx
+;   cmovzq  %r8, %rdx, %rdx
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -188,7 +188,8 @@ block0(v0: i32, v1: i64, v2: i64):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movq    %rsi, %rcx
+;   movq    %rsi, %r9
+;   movq    %r9, %rcx
 ;   shrl    %cl, %edi, %edi
 ;   movq    %rdi, %rax
 ;   movq    %rbp, %rsp
--- a/cranelift/filetests/filetests/isa/x64/vhigh_bits.clif
+++ b/cranelift/filetests/filetests/isa/x64/vhigh_bits.clif
@@ -0,0 +1,75 @@
+test compile precise-output
+target x86_64
+
+function %f1(i8x16) -> i8 {
+block0(v0: i8x16):
+  v1 = vhigh_bits.i8 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   pmovmskb %xmm0, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f2(i8x16) -> i16 {
+block0(v0: i8x16):
+  v1 = vhigh_bits.i16 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   pmovmskb %xmm0, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f3(i16x8) -> i8 {
+block0(v0: i16x8):
+  v1 = vhigh_bits.i8 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   packsswb %xmm0, %xmm0, %xmm0
+;   pmovmskb %xmm0, %eax
+;   shrq    $8, %rax, %rax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f4(i32x4) -> i8 {
+block0(v0: i32x4):
+  v1 = vhigh_bits.i8 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movmskps %xmm0, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f5(i64x2) -> i8 {
+block0(v0: i64x2):
+  v1 = vhigh_bits.i8 v0
+  return v1
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movmskpd %xmm0, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+