diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 0f491c42e1..a2a8e1a862 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -1521,6 +1521,13 @@
 ;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; Turn a vector type into its integer-typed vector equivalent.
+(decl vec_int_type (Type) Type)
+(rule (vec_int_type (multi_lane 8 16)) $I8X16)
+(rule (vec_int_type (multi_lane 16 8)) $I16X8)
+(rule (vec_int_type (multi_lane 32 4)) $I32X4)
+(rule (vec_int_type (multi_lane 64 2)) $I64X2)
+
 ;; Determine the appropriate operation for xor-ing vectors of the specified type
 (decl sse_xor_op (Type) SseOpcode)
 (rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
@@ -2021,6 +2028,11 @@
 (rule (x64_test size src1 src2)
       (cmp_rmi_r size (CmpOpcode.Test) src1 src2))
 
+;; Helper for creating `ptest` instructions.
+(decl x64_ptest (XmmMem Xmm) ProducesFlags)
+(rule (x64_ptest src1 src2)
+      (xmm_cmp_rm_r (SseOpcode.Ptest) src1 src2))
+
 ;; Helper for creating `cmove` instructions. Note that these instructions do not
 ;; always result in a single emitted x86 instruction; e.g., XmmCmove uses jumps
 ;; to conditionally move the selected value into an XMM register.
@@ -2889,6 +2901,21 @@
             (_ Unit (emit (MInst.XmmToGpr op src dst size))))
         dst))
 
+;; Helper for creating `pmovmskb` instructions.
+(decl x64_pmovmskb (OperandSize Xmm) Gpr)
+(rule (x64_pmovmskb size src)
+      (xmm_to_gpr (SseOpcode.Pmovmskb) src size))
+
+;; Helper for creating `movmskps` instructions.
+(decl x64_movmskps (OperandSize Xmm) Gpr)
+(rule (x64_movmskps size src)
+      (xmm_to_gpr (SseOpcode.Movmskps) src size))
+
+;; Helper for creating `movmskpd` instructions.
+(decl x64_movmskpd (OperandSize Xmm) Gpr)
+(rule (x64_movmskpd size src)
+      (xmm_to_gpr (SseOpcode.Movmskpd) src size))
+
 ;; Helper for creating `MInst.GprToXmm` instructions.
 (decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
 (rule (gpr_to_xmm op src size)
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 26b897d3b4..f848e0f9cf 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -89,6 +89,12 @@ impl Inst {
             dst: WritableXmm::from_writable_reg(dst).unwrap(),
         }
     }
+
+    fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
+        debug_assert!(dst.to_reg().class() == RegClass::Int);
+        let dst = WritableGpr::from_writable_reg(dst).unwrap();
+        Inst::Setcc { cc, dst }
+    }
 }
 
 #[test]
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 4278cb192a..950c635a94 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -478,12 +478,6 @@ impl Inst {
         Inst::Ud2 { trap_code }
     }
 
-    pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
-        debug_assert!(dst.to_reg().class() == RegClass::Int);
-        let dst = WritableGpr::from_writable_reg(dst).unwrap();
-        Inst::Setcc { cc, dst }
-    }
-
     pub(crate) fn cmove(size: OperandSize, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
         debug_assert!(size.is_one_of(&[
             OperandSize::Size16,
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 3e863633c0..0cdb6fe165 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3643,3 +3643,61 @@
         (src RegMem (RegMem.Reg src))
         (vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
     (vec_insert_lane ty vec src 1)))
+
+;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (vany_true val))
+      (with_flags (x64_ptest val val) (x64_setcc (CC.NZ))))
+
+;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (vall_true val @ (value_type ty)))
+      (let ((src Xmm val)
+            (zeros Xmm (x64_pxor src src))
+            (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
+        (with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
+
+;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The Intel specification allows using both 32-bit and 64-bit GPRs as
+;; destination for the "move mask" instructions. This is controlled by the REX.R
+;; bit: "In 64-bit mode, the instruction can access additional registers when
+;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode"
+;; (PMOVMSKB in IA Software Development Manual, vol. 2). This being the case, we
+;; will always clear REX.W since its use is unnecessary (`OperandSize` is used
+;; for setting/clearing REX.W) as we need at most 16 bits of output for
+;; `vhigh_bits`.
+
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 8 16))))
+      (x64_pmovmskb (OperandSize.Size32) val))
+
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 32 4))))
+      (x64_movmskps (OperandSize.Size32) val))
+
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 64 2))))
+      (x64_movmskpd (OperandSize.Size32) val))
+
+;; There is no x86 instruction for extracting the high bit of 16-bit lanes so
+;; here we:
+;; - duplicate the 16-bit lanes of `src` into 8-bit lanes:
+;;     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
+;; - use PMOVMSKB to gather the high bits; now we have duplicates, though
+;; - shift away the bottom 8 high bits to remove the duplicates.
+(rule (lower (vhigh_bits val @ (value_type (multi_lane 16 8))))
+      (let ((src Xmm val)
+            (tmp Xmm (x64_packsswb src src))
+            (tmp Gpr (x64_pmovmskb (OperandSize.Size32) tmp)))
+        (x64_shr $I64 tmp (Imm8Reg.Imm8 8))))
+
+;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (iconcat lo @ (value_type $I64) hi))
+      (value_regs lo hi))
+
+;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (isplit val @ (value_type $I128)))
+      (let ((regs ValueRegs val)
+            (lo Reg (value_regs_get regs 0))
+            (hi Reg (value_regs_get regs 1)))
+        (output_pair lo hi)))
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 20bd49356a..f646f51ab3 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -129,32 +129,6 @@ fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInput, Offset32)>
     }
 }
 
-/// Put the given input into a register or a memory operand.
-/// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem(ctx: &mut Lower<Inst>, spec: InsnInput) -> RegMem {
-    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
-
-    if let Some(c) = inputs.constant {
-        // Generate constants fresh at each use to minimize long-range register pressure.
-        let ty = ctx.input_ty(spec.insn, spec.input);
-        return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
-    }
-
-    if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
-        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
-            ctx.sink_inst(src_insn);
-            let amode = lower_to_amode(ctx, addr_input, offset);
-            return RegMem::mem(amode);
-        }
-    }
-
-    RegMem::reg(
-        ctx.put_input_in_regs(spec.insn, spec.input)
-            .only_reg()
-            .unwrap(),
-    )
-}
-
 fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
     ctx.get_input_as_source_or_const(spec.insn, spec.input)
         .constant
@@ -495,136 +469,17 @@ fn lower_insn_to_regs(
             | Opcode::Swizzle
            | Opcode::Extractlane
            | Opcode::ScalarToVector
-            | Opcode::Splat => {
+            | Opcode::Splat
+            | Opcode::VanyTrue
+            | Opcode::VallTrue
+            | Opcode::VhighBits
+            | Opcode::Iconcat
+            | Opcode::Isplit => {
                 implemented_in_isle(ctx);
             }
 
             Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
 
-            Opcode::VanyTrue => {
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                let src_ty = ctx.input_ty(insn, 0);
-                assert_eq!(src_ty.bits(), 128);
-                let src = put_input_in_reg(ctx, inputs[0]);
-                // Set the ZF if the result is all zeroes.
-                ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
-                // If the ZF is not set, place a 1 in `dst`.
-                ctx.emit(Inst::setcc(CC::NZ, dst));
-            }
-
-            Opcode::VallTrue => {
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                let src_ty = ctx.input_ty(insn, 0);
-                assert_eq!(src_ty.bits(), 128);
-                let src = input_to_reg_mem(ctx, inputs[0]);
-
-                let eq = |ty: Type| match ty.lane_bits() {
-                    8 => SseOpcode::Pcmpeqb,
-                    16 => SseOpcode::Pcmpeqw,
-                    32 => SseOpcode::Pcmpeqd,
-                    64 => SseOpcode::Pcmpeqq,
-                    _ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
-                };
-
-                // Initialize a register with all 0s.
-                let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
-                // Compare to see what lanes are filled with all 1s.
-                ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
-                // Set the ZF if the result is all zeroes.
-                ctx.emit(Inst::xmm_cmp_rm_r(
-                    SseOpcode::Ptest,
-                    RegMem::from(tmp),
-                    tmp.to_reg(),
-                ));
-                // If the ZF is set, place a 1 in `dst`.
-                ctx.emit(Inst::setcc(CC::Z, dst));
-            }
-
-            Opcode::VhighBits => {
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let src_ty = ctx.input_ty(insn, 0);
-                debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                debug_assert!(dst.to_reg().class() == RegClass::Int);
-
-                // The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
-                // the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
-                // the instruction can access additional registers when used with a REX.R prefix. The
-                // default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
-                // Manual, vol. 2). This being the case, we will always clear REX.W since its use is
-                // unnecessary (`OperandSize` is used for setting/clearing REX.W).
-                let size = OperandSize::Size32;
-
-                match src_ty {
-                    types::I8X16 | types::B8X16 => {
-                        ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
-                    }
-                    types::I32X4 | types::B32X4 | types::F32X4 => {
-                        ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
-                    }
-                    types::I64X2 | types::B64X2 | types::F64X2 => {
-                        ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
-                    }
-                    types::I16X8 | types::B16X8 => {
-                        // There is no x86 instruction for extracting the high bit of 16-bit lanes so
-                        // here we:
-                        // - duplicate the 16-bit lanes of `src` into 8-bit lanes:
-                        //     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
-                        // - use PMOVMSKB to gather the high bits; now we have duplicates, though
-                        // - shift away the bottom 8 high bits to remove the duplicates.
-                        let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
-                        ctx.emit(Inst::gen_move(tmp, src, src_ty));
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
-                        ctx.emit(Inst::xmm_to_gpr(
-                            SseOpcode::Pmovmskb,
-                            tmp.to_reg(),
-                            dst,
-                            size,
-                        ));
-                        ctx.emit(Inst::shift_r(
-                            OperandSize::Size64,
-                            ShiftKind::ShiftRightLogical,
-                            Some(8),
-                            dst,
-                        ));
-                    }
-                    _ => unimplemented!("unknown input type {} for {}", src_ty, op),
-                }
-            }
-
-            Opcode::Iconcat => {
-                let ty = ctx.output_ty(insn, 0);
-                assert_eq!(
-                    ty,
-                    types::I128,
-                    "Iconcat not expected to be used for non-128-bit type"
-                );
-                assert_eq!(ctx.input_ty(insn, 0), types::I64);
-                assert_eq!(ctx.input_ty(insn, 1), types::I64);
-                let lo = put_input_in_reg(ctx, inputs[0]);
-                let hi = put_input_in_reg(ctx, inputs[1]);
-                let dst = get_output_reg(ctx, outputs[0]);
-                ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
-                ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
-            }
-
-            Opcode::Isplit => {
-                let ty = ctx.input_ty(insn, 0);
-                assert_eq!(
-                    ty,
-                    types::I128,
-                    "Isplit not expected to be used for non-128-bit type"
-                );
-                assert_eq!(ctx.output_ty(insn, 0), types::I64);
-                assert_eq!(ctx.output_ty(insn, 1), types::I64);
-                let src = put_input_in_regs(ctx, inputs[0]);
-                let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
-                ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
-            }
-
             Opcode::TlsValue => {
                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
                 let (name, _, _) = ctx.symbol_value(insn).unwrap();
diff --git a/cranelift/filetests/filetests/isa/x64/ishl.clif b/cranelift/filetests/filetests/isa/x64/ishl.clif
index 4577d7c024..a544d260fc 100644
--- a/cranelift/filetests/filetests/isa/x64/ishl.clif
+++ b/cranelift/filetests/filetests/isa/x64/ishl.clif
@@ -17,20 +17,20 @@ block0(v0: i128, v1: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movzbq %dl, %rax
-; movq %rax, %rcx
+; movzbq %dl, %rcx
 ; movq %rdi, %rdx
 ; shlq %cl, %rdx, %rdx
 ; shlq %cl, %rsi, %rsi
-; movq %rcx, %r8
+; movq %rcx, %rax
 ; movl $64, %ecx
-; subq %rcx, %r8, %rcx
+; movq %rax, %r10
+; subq %rcx, %r10, %rcx
 ; shrq %cl, %rdi, %rdi
 ; xorq %rax, %rax, %rax
-; testq $127, %r8
+; testq $127, %r10
 ; cmovzq %rax, %rdi, %rdi
 ; orq %rdi, %rsi, %rdi
-; testq $64, %r8
+; testq $64, %r10
 ; cmovzq %rdx, %rax, %rax
 ; cmovzq %rdi, %rdx, %rdx
 ; movq %rbp, %rsp
diff --git a/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif
index cad0c91c0d..2be24e4b18 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-logical-compile.clif
@@ -41,9 +41,9 @@ block0(v0: i64x2):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; pxor %xmm4, %xmm4, %xmm4
-; pcmpeqq %xmm4, %xmm0, %xmm4
-; ptest %xmm4, %xmm4
+; pxor %xmm3, %xmm3, %xmm3
+; pcmpeqq %xmm0, %xmm3, %xmm0
+; ptest %xmm0, %xmm0
 ; setz %al
 ; movq %rbp, %rsp
 ; popq %rbp
diff --git a/cranelift/filetests/filetests/isa/x64/sshr.clif b/cranelift/filetests/filetests/isa/x64/sshr.clif
index 0c8f1dd44e..e03b6dae95 100644
--- a/cranelift/filetests/filetests/isa/x64/sshr.clif
+++ b/cranelift/filetests/filetests/isa/x64/sshr.clif
@@ -16,24 +16,25 @@ block0(v0: i128, v1: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movzbq %dl, %rdx
-; movq %rdx, %rcx
+; movzbq %dl, %rcx
 ; shrq %cl, %rdi, %rdi
-; movq %rsi, %r9
-; sarq %cl, %r9, %r9
+; movq %rsi, %rdx
+; sarq %cl, %rdx, %rdx
+; movq %rcx, %rax
 ; movl $64, %ecx
-; subq %rcx, %rdx, %rcx
-; movq %rsi, %r8
-; shlq %cl, %r8, %r8
-; xorq %r10, %r10, %r10
-; testq $127, %rdx
-; cmovzq %r10, %r8, %r8
-; orq %rdi, %r8, %rdi
+; movq %rax, %r11
+; subq %rcx, %r11, %rcx
+; movq %rsi, %rax
+; shlq %cl, %rax, %rax
+; xorq %r8, %r8, %r8
+; testq $127, %r11
+; cmovzq %r8, %rax, %rax
+; orq %rdi, %rax, %rdi
 ; sarq $63, %rsi, %rsi
-; testq $64, %rdx
-; movq %r9, %rax
+; testq $64, %r11
+; movq %rdx, %rax
 ; cmovzq %rdi, %rax, %rax
-; cmovzq %r9, %rsi, %rsi
+; cmovzq %rdx, %rsi, %rsi
 ; movq %rsi, %rdx
 ; movq %rbp, %rsp
 ; popq %rbp
diff --git a/cranelift/filetests/filetests/isa/x64/ushr.clif b/cranelift/filetests/filetests/isa/x64/ushr.clif
index 8f7f7800ce..1371f2d23a 100644
--- a/cranelift/filetests/filetests/isa/x64/ushr.clif
+++ b/cranelift/filetests/filetests/isa/x64/ushr.clif
@@ -15,24 +15,24 @@ block0(v0: i128, v1: i8):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movzbq %dl, %rdx
-; movq %rdx, %rcx
+; movzbq %dl, %rcx
 ; shrq %cl, %rdi, %rdi
-; movq %rsi, %r9
-; shrq %cl, %r9, %r9
+; movq %rsi, %r8
+; shrq %cl, %r8, %r8
+; movq %rcx, %rax
 ; movl $64, %ecx
-; movq %rdx, %r10
-; subq %rcx, %r10, %rcx
+; movq %rax, %r11
+; subq %rcx, %r11, %rcx
 ; shlq %cl, %rsi, %rsi
-; xorq %r8, %r8, %r8
-; testq $127, %r10
-; cmovzq %r8, %rsi, %rsi
+; xorq %rax, %rax, %rax
+; testq $127, %r11
+; cmovzq %rax, %rsi, %rsi
 ; orq %rsi, %rdi, %rsi
 ; xorq %rdx, %rdx, %rdx
-; testq $64, %r10
-; movq %r9, %rax
+; testq $64, %r11
+; movq %r8, %rax
 ; cmovzq %rsi, %rax, %rax
-; cmovzq %r9, %rdx, %rdx
+; cmovzq %r8, %rdx, %rdx
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -188,7 +188,8 @@ block0(v0: i32, v1: i64, v2: i64):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movq %rsi, %rcx
+; movq %rsi, %r9
+; movq %r9, %rcx
 ; shrl %cl, %edi, %edi
 ; movq %rdi, %rax
 ; movq %rbp, %rsp
diff --git a/cranelift/filetests/filetests/isa/x64/vhigh_bits.clif b/cranelift/filetests/filetests/isa/x64/vhigh_bits.clif
new file mode 100644
index 0000000000..185cb62764
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/vhigh_bits.clif
@@ -0,0 +1,75 @@
+test compile precise-output
+target x86_64
+
+function %f1(i8x16) -> i8 {
+block0(v0: i8x16):
+    v1 = vhigh_bits.i8 v0
+    return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pmovmskb %xmm0, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f2(i8x16) -> i16 {
+block0(v0: i8x16):
+    v1 = vhigh_bits.i16 v0
+    return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; pmovmskb %xmm0, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f3(i16x8) -> i8 {
+block0(v0: i16x8):
+    v1 = vhigh_bits.i8 v0
+    return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; packsswb %xmm0, %xmm0, %xmm0
+; pmovmskb %xmm0, %eax
+; shrq $8, %rax, %rax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f4(i32x4) -> i8 {
+block0(v0: i32x4):
+    v1 = vhigh_bits.i8 v0
+    return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movmskps %xmm0, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+
+function %f5(i64x2) -> i8 {
+block0(v0: i64x2):
+    v1 = vhigh_bits.i8 v0
+    return v1
+}
+
+; pushq %rbp
+; movq %rsp, %rbp
+; block0:
+; movmskpd %xmm0, %eax
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+