x64: Lower vany_true, vall_true, vhigh_bits, iconcat, and isplit in ISLE (#4787)
Lower vany_true, vall_true, vhigh_bits, iconcat, and isplit in ISLE.
This commit is contained in:
@@ -1521,6 +1521,13 @@
|
|||||||
|
|
||||||
;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;; Helpers for Working SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; Turn a vector type into its integer-typed vector equivalent.
|
||||||
|
(decl vec_int_type (Type) Type)
|
||||||
|
(rule (vec_int_type (multi_lane 8 16)) $I8X16)
|
||||||
|
(rule (vec_int_type (multi_lane 16 8)) $I16X8)
|
||||||
|
(rule (vec_int_type (multi_lane 32 4)) $I32X4)
|
||||||
|
(rule (vec_int_type (multi_lane 64 2)) $I64X2)
|
||||||
|
|
||||||
;; Determine the appropriate operation for xor-ing vectors of the specified type
|
;; Determine the appropriate operation for xor-ing vectors of the specified type
|
||||||
(decl sse_xor_op (Type) SseOpcode)
|
(decl sse_xor_op (Type) SseOpcode)
|
||||||
(rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
|
(rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
|
||||||
@@ -2021,6 +2028,11 @@
|
|||||||
(rule (x64_test size src1 src2)
|
(rule (x64_test size src1 src2)
|
||||||
(cmp_rmi_r size (CmpOpcode.Test) src1 src2))
|
(cmp_rmi_r size (CmpOpcode.Test) src1 src2))
|
||||||
|
|
||||||
|
;; Helper for creating `ptest` instructions.
|
||||||
|
(decl x64_ptest (XmmMem Xmm) ProducesFlags)
|
||||||
|
(rule (x64_ptest src1 src2)
|
||||||
|
(xmm_cmp_rm_r (SseOpcode.Ptest) src1 src2))
|
||||||
|
|
||||||
;; Helper for creating `cmove` instructions. Note that these instructions do not
|
;; Helper for creating `cmove` instructions. Note that these instructions do not
|
||||||
;; always result in a single emitted x86 instruction; e.g., XmmCmove uses jumps
|
;; always result in a single emitted x86 instruction; e.g., XmmCmove uses jumps
|
||||||
;; to conditionally move the selected value into an XMM register.
|
;; to conditionally move the selected value into an XMM register.
|
||||||
@@ -2889,6 +2901,21 @@
|
|||||||
(_ Unit (emit (MInst.XmmToGpr op src dst size))))
|
(_ Unit (emit (MInst.XmmToGpr op src dst size))))
|
||||||
dst))
|
dst))
|
||||||
|
|
||||||
|
;; Helper for creating `pmovmskb` instructions.
|
||||||
|
(decl x64_pmovmskb (OperandSize Xmm) Gpr)
|
||||||
|
(rule (x64_pmovmskb size src)
|
||||||
|
(xmm_to_gpr (SseOpcode.Pmovmskb) src size))
|
||||||
|
|
||||||
|
;; Helper for creating `movmskps` instructions.
|
||||||
|
(decl x64_movmskps (OperandSize Xmm) Gpr)
|
||||||
|
(rule (x64_movmskps size src)
|
||||||
|
(xmm_to_gpr (SseOpcode.Movmskps) src size))
|
||||||
|
|
||||||
|
;; Helper for creating `movmskpd` instructions.
|
||||||
|
(decl x64_movmskpd (OperandSize Xmm) Gpr)
|
||||||
|
(rule (x64_movmskpd size src)
|
||||||
|
(xmm_to_gpr (SseOpcode.Movmskpd) src size))
|
||||||
|
|
||||||
;; Helper for creating `MInst.GprToXmm` instructions.
|
;; Helper for creating `MInst.GprToXmm` instructions.
|
||||||
(decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
|
(decl gpr_to_xmm (SseOpcode GprMem OperandSize) Xmm)
|
||||||
(rule (gpr_to_xmm op src size)
|
(rule (gpr_to_xmm op src size)
|
||||||
|
|||||||
@@ -89,6 +89,12 @@ impl Inst {
|
|||||||
dst: WritableXmm::from_writable_reg(dst).unwrap(),
|
dst: WritableXmm::from_writable_reg(dst).unwrap(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
|
||||||
|
debug_assert!(dst.to_reg().class() == RegClass::Int);
|
||||||
|
let dst = WritableGpr::from_writable_reg(dst).unwrap();
|
||||||
|
Inst::Setcc { cc, dst }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -478,12 +478,6 @@ impl Inst {
|
|||||||
Inst::Ud2 { trap_code }
|
Inst::Ud2 { trap_code }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn setcc(cc: CC, dst: Writable<Reg>) -> Inst {
|
|
||||||
debug_assert!(dst.to_reg().class() == RegClass::Int);
|
|
||||||
let dst = WritableGpr::from_writable_reg(dst).unwrap();
|
|
||||||
Inst::Setcc { cc, dst }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn cmove(size: OperandSize, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
|
pub(crate) fn cmove(size: OperandSize, cc: CC, src: RegMem, dst: Writable<Reg>) -> Inst {
|
||||||
debug_assert!(size.is_one_of(&[
|
debug_assert!(size.is_one_of(&[
|
||||||
OperandSize::Size16,
|
OperandSize::Size16,
|
||||||
|
|||||||
@@ -3643,3 +3643,61 @@
|
|||||||
(src RegMem (RegMem.Reg src))
|
(src RegMem (RegMem.Reg src))
|
||||||
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
|
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
|
||||||
(vec_insert_lane ty vec src 1)))
|
(vec_insert_lane ty vec src 1)))
|
||||||
|
|
||||||
|
;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(rule (lower (vany_true val))
|
||||||
|
(with_flags (x64_ptest val val) (x64_setcc (CC.NZ))))
|
||||||
|
|
||||||
|
;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(rule (lower (vall_true val @ (value_type ty)))
|
||||||
|
(let ((src Xmm val)
|
||||||
|
(zeros Xmm (x64_pxor src src))
|
||||||
|
(cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
|
||||||
|
(with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
|
||||||
|
|
||||||
|
;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;; The Intel specification allows using both 32-bit and 64-bit GPRs as
|
||||||
|
;; destination for the "move mask" instructions. This is controlled by the REX.R
|
||||||
|
;; bit: "In 64-bit mode, the instruction can access additional registers when
|
||||||
|
;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode"
|
||||||
|
;; (PMOVMSKB in IA Software Development Manual, vol. 2). This being the case, we
|
||||||
|
;; will always clear REX.W since its use is unnecessary (`OperandSize` is used
|
||||||
|
;; for setting/clearing REX.W) as we need at most 16 bits of output for
|
||||||
|
;; `vhigh_bits`.
|
||||||
|
|
||||||
|
(rule (lower (vhigh_bits val @ (value_type (multi_lane 8 16))))
|
||||||
|
(x64_pmovmskb (OperandSize.Size32) val))
|
||||||
|
|
||||||
|
(rule (lower (vhigh_bits val @ (value_type (multi_lane 32 4))))
|
||||||
|
(x64_movmskps (OperandSize.Size32) val))
|
||||||
|
|
||||||
|
(rule (lower (vhigh_bits val @ (value_type (multi_lane 64 2))))
|
||||||
|
(x64_movmskpd (OperandSize.Size32) val))
|
||||||
|
|
||||||
|
;; There is no x86 instruction for extracting the high bit of 16-bit lanes so
|
||||||
|
;; here we:
|
||||||
|
;; - duplicate the 16-bit lanes of `src` into 8-bit lanes:
|
||||||
|
;; PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
|
||||||
|
;; - use PMOVMSKB to gather the high bits; now we have duplicates, though
|
||||||
|
;; - shift away the bottom 8 high bits to remove the duplicates.
|
||||||
|
(rule (lower (vhigh_bits val @ (value_type (multi_lane 16 8))))
|
||||||
|
(let ((src Xmm val)
|
||||||
|
(tmp Xmm (x64_packsswb src src))
|
||||||
|
(tmp Gpr (x64_pmovmskb (OperandSize.Size32) tmp)))
|
||||||
|
(x64_shr $I64 tmp (Imm8Reg.Imm8 8))))
|
||||||
|
|
||||||
|
;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(rule (lower (iconcat lo @ (value_type $I64) hi))
|
||||||
|
(value_regs lo hi))
|
||||||
|
|
||||||
|
;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(rule (lower (isplit val @ (value_type $I128)))
|
||||||
|
(let ((regs ValueRegs val)
|
||||||
|
(lo Reg (value_regs_get regs 0))
|
||||||
|
(hi Reg (value_regs_get regs 1)))
|
||||||
|
(output_pair lo hi)))
|
||||||
|
|||||||
@@ -129,32 +129,6 @@ fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInp
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Put the given input into a register or a memory operand.
|
|
||||||
/// Effectful: may mark the given input as used, when returning the register form.
|
|
||||||
fn input_to_reg_mem(ctx: &mut Lower<Inst>, spec: InsnInput) -> RegMem {
|
|
||||||
let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
|
|
||||||
|
|
||||||
if let Some(c) = inputs.constant {
|
|
||||||
// Generate constants fresh at each use to minimize long-range register pressure.
|
|
||||||
let ty = ctx.input_ty(spec.insn, spec.input);
|
|
||||||
return RegMem::reg(generate_constant(ctx, ty, c).only_reg().unwrap());
|
|
||||||
}
|
|
||||||
|
|
||||||
if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst {
|
|
||||||
if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
|
|
||||||
ctx.sink_inst(src_insn);
|
|
||||||
let amode = lower_to_amode(ctx, addr_input, offset);
|
|
||||||
return RegMem::mem(amode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
RegMem::reg(
|
|
||||||
ctx.put_input_in_regs(spec.insn, spec.input)
|
|
||||||
.only_reg()
|
|
||||||
.unwrap(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
|
fn input_to_imm(ctx: &mut Lower<Inst>, spec: InsnInput) -> Option<u64> {
|
||||||
ctx.get_input_as_source_or_const(spec.insn, spec.input)
|
ctx.get_input_as_source_or_const(spec.insn, spec.input)
|
||||||
.constant
|
.constant
|
||||||
@@ -495,136 +469,17 @@ fn lower_insn_to_regs(
|
|||||||
| Opcode::Swizzle
|
| Opcode::Swizzle
|
||||||
| Opcode::Extractlane
|
| Opcode::Extractlane
|
||||||
| Opcode::ScalarToVector
|
| Opcode::ScalarToVector
|
||||||
| Opcode::Splat => {
|
| Opcode::Splat
|
||||||
|
| Opcode::VanyTrue
|
||||||
|
| Opcode::VallTrue
|
||||||
|
| Opcode::VhighBits
|
||||||
|
| Opcode::Iconcat
|
||||||
|
| Opcode::Isplit => {
|
||||||
implemented_in_isle(ctx);
|
implemented_in_isle(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
|
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
|
||||||
|
|
||||||
Opcode::VanyTrue => {
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
let src_ty = ctx.input_ty(insn, 0);
|
|
||||||
assert_eq!(src_ty.bits(), 128);
|
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
// Set the ZF if the result is all zeroes.
|
|
||||||
ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
|
|
||||||
// If the ZF is not set, place a 1 in `dst`.
|
|
||||||
ctx.emit(Inst::setcc(CC::NZ, dst));
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::VallTrue => {
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
let src_ty = ctx.input_ty(insn, 0);
|
|
||||||
assert_eq!(src_ty.bits(), 128);
|
|
||||||
let src = input_to_reg_mem(ctx, inputs[0]);
|
|
||||||
|
|
||||||
let eq = |ty: Type| match ty.lane_bits() {
|
|
||||||
8 => SseOpcode::Pcmpeqb,
|
|
||||||
16 => SseOpcode::Pcmpeqw,
|
|
||||||
32 => SseOpcode::Pcmpeqd,
|
|
||||||
64 => SseOpcode::Pcmpeqq,
|
|
||||||
_ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
|
|
||||||
};
|
|
||||||
|
|
||||||
// Initialize a register with all 0s.
|
|
||||||
let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
|
|
||||||
// Compare to see what lanes are filled with all 1s.
|
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
|
|
||||||
// Set the ZF if the result is all zeroes.
|
|
||||||
ctx.emit(Inst::xmm_cmp_rm_r(
|
|
||||||
SseOpcode::Ptest,
|
|
||||||
RegMem::from(tmp),
|
|
||||||
tmp.to_reg(),
|
|
||||||
));
|
|
||||||
// If the ZF is set, place a 1 in `dst`.
|
|
||||||
ctx.emit(Inst::setcc(CC::Z, dst));
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::VhighBits => {
|
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let src_ty = ctx.input_ty(insn, 0);
|
|
||||||
debug_assert!(src_ty.is_vector() && src_ty.bits() == 128);
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
debug_assert!(dst.to_reg().class() == RegClass::Int);
|
|
||||||
|
|
||||||
// The Intel specification allows using both 32-bit and 64-bit GPRs as destination for
|
|
||||||
// the "move mask" instructions. This is controlled by the REX.R bit: "In 64-bit mode,
|
|
||||||
// the instruction can access additional registers when used with a REX.R prefix. The
|
|
||||||
// default operand size is 64-bit in 64-bit mode" (PMOVMSKB in IA Software Development
|
|
||||||
// Manual, vol. 2). This being the case, we will always clear REX.W since its use is
|
|
||||||
// unnecessary (`OperandSize` is used for setting/clearing REX.W).
|
|
||||||
let size = OperandSize::Size32;
|
|
||||||
|
|
||||||
match src_ty {
|
|
||||||
types::I8X16 | types::B8X16 => {
|
|
||||||
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Pmovmskb, src, dst, size))
|
|
||||||
}
|
|
||||||
types::I32X4 | types::B32X4 | types::F32X4 => {
|
|
||||||
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskps, src, dst, size))
|
|
||||||
}
|
|
||||||
types::I64X2 | types::B64X2 | types::F64X2 => {
|
|
||||||
ctx.emit(Inst::xmm_to_gpr(SseOpcode::Movmskpd, src, dst, size))
|
|
||||||
}
|
|
||||||
types::I16X8 | types::B16X8 => {
|
|
||||||
// There is no x86 instruction for extracting the high bit of 16-bit lanes so
|
|
||||||
// here we:
|
|
||||||
// - duplicate the 16-bit lanes of `src` into 8-bit lanes:
|
|
||||||
// PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
|
|
||||||
// - use PMOVMSKB to gather the high bits; now we have duplicates, though
|
|
||||||
// - shift away the bottom 8 high bits to remove the duplicates.
|
|
||||||
let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::gen_move(tmp, src, src_ty));
|
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Packsswb, RegMem::reg(src), tmp));
|
|
||||||
ctx.emit(Inst::xmm_to_gpr(
|
|
||||||
SseOpcode::Pmovmskb,
|
|
||||||
tmp.to_reg(),
|
|
||||||
dst,
|
|
||||||
size,
|
|
||||||
));
|
|
||||||
ctx.emit(Inst::shift_r(
|
|
||||||
OperandSize::Size64,
|
|
||||||
ShiftKind::ShiftRightLogical,
|
|
||||||
Some(8),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
_ => unimplemented!("unknown input type {} for {}", src_ty, op),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::Iconcat => {
|
|
||||||
let ty = ctx.output_ty(insn, 0);
|
|
||||||
assert_eq!(
|
|
||||||
ty,
|
|
||||||
types::I128,
|
|
||||||
"Iconcat not expected to be used for non-128-bit type"
|
|
||||||
);
|
|
||||||
assert_eq!(ctx.input_ty(insn, 0), types::I64);
|
|
||||||
assert_eq!(ctx.input_ty(insn, 1), types::I64);
|
|
||||||
let lo = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let hi = put_input_in_reg(ctx, inputs[1]);
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]);
|
|
||||||
ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64));
|
|
||||||
ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64));
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::Isplit => {
|
|
||||||
let ty = ctx.input_ty(insn, 0);
|
|
||||||
assert_eq!(
|
|
||||||
ty,
|
|
||||||
types::I128,
|
|
||||||
"Isplit not expected to be used for non-128-bit type"
|
|
||||||
);
|
|
||||||
assert_eq!(ctx.output_ty(insn, 0), types::I64);
|
|
||||||
assert_eq!(ctx.output_ty(insn, 1), types::I64);
|
|
||||||
let src = put_input_in_regs(ctx, inputs[0]);
|
|
||||||
let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64));
|
|
||||||
ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64));
|
|
||||||
}
|
|
||||||
|
|
||||||
Opcode::TlsValue => {
|
Opcode::TlsValue => {
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
let (name, _, _) = ctx.symbol_value(insn).unwrap();
|
let (name, _, _) = ctx.symbol_value(insn).unwrap();
|
||||||
|
|||||||
@@ -17,20 +17,20 @@ block0(v0: i128, v1: i8):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movzbq %dl, %rax
|
; movzbq %dl, %rcx
|
||||||
; movq %rax, %rcx
|
|
||||||
; movq %rdi, %rdx
|
; movq %rdi, %rdx
|
||||||
; shlq %cl, %rdx, %rdx
|
; shlq %cl, %rdx, %rdx
|
||||||
; shlq %cl, %rsi, %rsi
|
; shlq %cl, %rsi, %rsi
|
||||||
; movq %rcx, %r8
|
; movq %rcx, %rax
|
||||||
; movl $64, %ecx
|
; movl $64, %ecx
|
||||||
; subq %rcx, %r8, %rcx
|
; movq %rax, %r10
|
||||||
|
; subq %rcx, %r10, %rcx
|
||||||
; shrq %cl, %rdi, %rdi
|
; shrq %cl, %rdi, %rdi
|
||||||
; xorq %rax, %rax, %rax
|
; xorq %rax, %rax, %rax
|
||||||
; testq $127, %r8
|
; testq $127, %r10
|
||||||
; cmovzq %rax, %rdi, %rdi
|
; cmovzq %rax, %rdi, %rdi
|
||||||
; orq %rdi, %rsi, %rdi
|
; orq %rdi, %rsi, %rdi
|
||||||
; testq $64, %r8
|
; testq $64, %r10
|
||||||
; cmovzq %rdx, %rax, %rax
|
; cmovzq %rdx, %rax, %rax
|
||||||
; cmovzq %rdi, %rdx, %rdx
|
; cmovzq %rdi, %rdx, %rdx
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
|
|||||||
@@ -41,9 +41,9 @@ block0(v0: i64x2):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; pxor %xmm4, %xmm4, %xmm4
|
; pxor %xmm3, %xmm3, %xmm3
|
||||||
; pcmpeqq %xmm4, %xmm0, %xmm4
|
; pcmpeqq %xmm0, %xmm3, %xmm0
|
||||||
; ptest %xmm4, %xmm4
|
; ptest %xmm0, %xmm0
|
||||||
; setz %al
|
; setz %al
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
|
|||||||
@@ -16,24 +16,25 @@ block0(v0: i128, v1: i8):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movzbq %dl, %rdx
|
; movzbq %dl, %rcx
|
||||||
; movq %rdx, %rcx
|
|
||||||
; shrq %cl, %rdi, %rdi
|
; shrq %cl, %rdi, %rdi
|
||||||
; movq %rsi, %r9
|
; movq %rsi, %rdx
|
||||||
; sarq %cl, %r9, %r9
|
; sarq %cl, %rdx, %rdx
|
||||||
|
; movq %rcx, %rax
|
||||||
; movl $64, %ecx
|
; movl $64, %ecx
|
||||||
; subq %rcx, %rdx, %rcx
|
; movq %rax, %r11
|
||||||
; movq %rsi, %r8
|
; subq %rcx, %r11, %rcx
|
||||||
; shlq %cl, %r8, %r8
|
; movq %rsi, %rax
|
||||||
; xorq %r10, %r10, %r10
|
; shlq %cl, %rax, %rax
|
||||||
; testq $127, %rdx
|
; xorq %r8, %r8, %r8
|
||||||
; cmovzq %r10, %r8, %r8
|
; testq $127, %r11
|
||||||
; orq %rdi, %r8, %rdi
|
; cmovzq %r8, %rax, %rax
|
||||||
|
; orq %rdi, %rax, %rdi
|
||||||
; sarq $63, %rsi, %rsi
|
; sarq $63, %rsi, %rsi
|
||||||
; testq $64, %rdx
|
; testq $64, %r11
|
||||||
; movq %r9, %rax
|
; movq %rdx, %rax
|
||||||
; cmovzq %rdi, %rax, %rax
|
; cmovzq %rdi, %rax, %rax
|
||||||
; cmovzq %r9, %rsi, %rsi
|
; cmovzq %rdx, %rsi, %rsi
|
||||||
; movq %rsi, %rdx
|
; movq %rsi, %rdx
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
|
|||||||
@@ -15,24 +15,24 @@ block0(v0: i128, v1: i8):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movzbq %dl, %rdx
|
; movzbq %dl, %rcx
|
||||||
; movq %rdx, %rcx
|
|
||||||
; shrq %cl, %rdi, %rdi
|
; shrq %cl, %rdi, %rdi
|
||||||
; movq %rsi, %r9
|
; movq %rsi, %r8
|
||||||
; shrq %cl, %r9, %r9
|
; shrq %cl, %r8, %r8
|
||||||
|
; movq %rcx, %rax
|
||||||
; movl $64, %ecx
|
; movl $64, %ecx
|
||||||
; movq %rdx, %r10
|
; movq %rax, %r11
|
||||||
; subq %rcx, %r10, %rcx
|
; subq %rcx, %r11, %rcx
|
||||||
; shlq %cl, %rsi, %rsi
|
; shlq %cl, %rsi, %rsi
|
||||||
; xorq %r8, %r8, %r8
|
; xorq %rax, %rax, %rax
|
||||||
; testq $127, %r10
|
; testq $127, %r11
|
||||||
; cmovzq %r8, %rsi, %rsi
|
; cmovzq %rax, %rsi, %rsi
|
||||||
; orq %rsi, %rdi, %rsi
|
; orq %rsi, %rdi, %rsi
|
||||||
; xorq %rdx, %rdx, %rdx
|
; xorq %rdx, %rdx, %rdx
|
||||||
; testq $64, %r10
|
; testq $64, %r11
|
||||||
; movq %r9, %rax
|
; movq %r8, %rax
|
||||||
; cmovzq %rsi, %rax, %rax
|
; cmovzq %rsi, %rax, %rax
|
||||||
; cmovzq %r9, %rdx, %rdx
|
; cmovzq %r8, %rdx, %rdx
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
; popq %rbp
|
; popq %rbp
|
||||||
; ret
|
; ret
|
||||||
@@ -188,7 +188,8 @@ block0(v0: i32, v1: i64, v2: i64):
|
|||||||
; pushq %rbp
|
; pushq %rbp
|
||||||
; movq %rsp, %rbp
|
; movq %rsp, %rbp
|
||||||
; block0:
|
; block0:
|
||||||
; movq %rsi, %rcx
|
; movq %rsi, %r9
|
||||||
|
; movq %r9, %rcx
|
||||||
; shrl %cl, %edi, %edi
|
; shrl %cl, %edi, %edi
|
||||||
; movq %rdi, %rax
|
; movq %rdi, %rax
|
||||||
; movq %rbp, %rsp
|
; movq %rbp, %rsp
|
||||||
|
|||||||
75
cranelift/filetests/filetests/isa/x64/vhigh_bits.clif
Normal file
75
cranelift/filetests/filetests/isa/x64/vhigh_bits.clif
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
test compile precise-output
|
||||||
|
target x86_64
|
||||||
|
|
||||||
|
function %f1(i8x16) -> i8 {
|
||||||
|
block0(v0: i8x16):
|
||||||
|
v1 = vhigh_bits.i8 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pmovmskb %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f2(i8x16) -> i16 {
|
||||||
|
block0(v0: i8x16):
|
||||||
|
v1 = vhigh_bits.i16 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; pmovmskb %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f3(i16x8) -> i8 {
|
||||||
|
block0(v0: i16x8):
|
||||||
|
v1 = vhigh_bits.i8 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; packsswb %xmm0, %xmm0, %xmm0
|
||||||
|
; pmovmskb %xmm0, %eax
|
||||||
|
; shrq $8, %rax, %rax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f4(i32x4) -> i8 {
|
||||||
|
block0(v0: i32x4):
|
||||||
|
v1 = vhigh_bits.i8 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movmskps %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
|
function %f5(i64x2) -> i8 {
|
||||||
|
block0(v0: i64x2):
|
||||||
|
v1 = vhigh_bits.i8 v0
|
||||||
|
return v1
|
||||||
|
}
|
||||||
|
|
||||||
|
; pushq %rbp
|
||||||
|
; movq %rsp, %rbp
|
||||||
|
; block0:
|
||||||
|
; movmskpd %xmm0, %eax
|
||||||
|
; movq %rbp, %rsp
|
||||||
|
; popq %rbp
|
||||||
|
; ret
|
||||||
|
|
||||||
Reference in New Issue
Block a user