[machinst x64]: lower remaining lane operations--any_true, all_true, splat
This commit is contained in:
@@ -459,6 +459,7 @@ pub enum SseOpcode {
|
|||||||
Psubd,
|
Psubd,
|
||||||
Psubq,
|
Psubq,
|
||||||
Psubw,
|
Psubw,
|
||||||
|
Ptest,
|
||||||
Pxor,
|
Pxor,
|
||||||
Rcpss,
|
Rcpss,
|
||||||
Roundss,
|
Roundss,
|
||||||
@@ -606,6 +607,7 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Pminuw
|
| SseOpcode::Pminuw
|
||||||
| SseOpcode::Pminud
|
| SseOpcode::Pminud
|
||||||
| SseOpcode::Pmulld
|
| SseOpcode::Pmulld
|
||||||
|
| SseOpcode::Ptest
|
||||||
| SseOpcode::Roundss
|
| SseOpcode::Roundss
|
||||||
| SseOpcode::Roundsd => SSE41,
|
| SseOpcode::Roundsd => SSE41,
|
||||||
|
|
||||||
@@ -734,6 +736,7 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Psubd => "psubd",
|
SseOpcode::Psubd => "psubd",
|
||||||
SseOpcode::Psubq => "psubq",
|
SseOpcode::Psubq => "psubq",
|
||||||
SseOpcode::Psubw => "psubw",
|
SseOpcode::Psubw => "psubw",
|
||||||
|
SseOpcode::Ptest => "ptest",
|
||||||
SseOpcode::Pxor => "pxor",
|
SseOpcode::Pxor => "pxor",
|
||||||
SseOpcode::Rcpss => "rcpss",
|
SseOpcode::Rcpss => "rcpss",
|
||||||
SseOpcode::Roundss => "roundss",
|
SseOpcode::Roundss => "roundss",
|
||||||
|
|||||||
@@ -2003,6 +2003,11 @@ pub(crate) fn emit(
|
|||||||
sink.bind_label(constant_end_label);
|
sink.bind_label(constant_end_label);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Inst::XmmFakeDef { .. } => {
|
||||||
|
// This instruction format only exists to declare a register as a `def`; no code is
|
||||||
|
// emitted.
|
||||||
|
}
|
||||||
|
|
||||||
Inst::Xmm_Mov_R_M {
|
Inst::Xmm_Mov_R_M {
|
||||||
op,
|
op,
|
||||||
src,
|
src,
|
||||||
@@ -2087,19 +2092,20 @@ pub(crate) fn emit(
|
|||||||
|
|
||||||
Inst::XMM_Cmp_RM_R { op, src, dst } => {
|
Inst::XMM_Cmp_RM_R { op, src, dst } => {
|
||||||
let rex = RexFlags::clear_w();
|
let rex = RexFlags::clear_w();
|
||||||
let (prefix, opcode) = match op {
|
let (prefix, opcode, len) = match op {
|
||||||
SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E),
|
SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3),
|
||||||
SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E),
|
SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2),
|
||||||
|
SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2),
|
||||||
_ => unimplemented!("Emit xmm cmp rm r"),
|
_ => unimplemented!("Emit xmm cmp rm r"),
|
||||||
};
|
};
|
||||||
|
|
||||||
match src {
|
match src {
|
||||||
RegMem::Reg { reg } => {
|
RegMem::Reg { reg } => {
|
||||||
emit_std_reg_reg(sink, prefix, opcode, 2, *dst, *reg, rex);
|
emit_std_reg_reg(sink, prefix, opcode, len, *dst, *reg, rex);
|
||||||
}
|
}
|
||||||
RegMem::Mem { addr } => {
|
RegMem::Mem { addr } => {
|
||||||
let addr = &addr.finalize(state);
|
let addr = &addr.finalize(state);
|
||||||
emit_std_reg_mem(sink, prefix, opcode, 2, *dst, addr, rex);
|
emit_std_reg_mem(sink, prefix, opcode, len, *dst, addr, rex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -342,6 +342,10 @@ pub enum Inst {
|
|||||||
is64: bool,
|
is64: bool,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// Provides a way to tell the register allocator that the upcoming sequence of instructions
|
||||||
|
/// will overwrite `dst` so it should be considered as a `def`; use with care.
|
||||||
|
XmmFakeDef { dst: Writable<Reg> },
|
||||||
|
|
||||||
// =====================================
|
// =====================================
|
||||||
// Control flow instructions.
|
// Control flow instructions.
|
||||||
/// Direct call: call simm32.
|
/// Direct call: call simm32.
|
||||||
@@ -640,6 +644,11 @@ impl Inst {
|
|||||||
Inst::XMM_RM_R { op, src, dst }
|
Inst::XMM_RM_R { op, src, dst }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn xmm_fake_def(dst: Writable<Reg>) -> Self {
|
||||||
|
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
||||||
|
Inst::XmmFakeDef { dst }
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn xmm_mov_r_m(
|
pub(crate) fn xmm_mov_r_m(
|
||||||
op: SseOpcode,
|
op: SseOpcode,
|
||||||
src: Reg,
|
src: Reg,
|
||||||
@@ -1324,6 +1333,12 @@ impl ShowWithRRU for Inst {
|
|||||||
dst.show_rru(mb_rru),
|
dst.show_rru(mb_rru),
|
||||||
),
|
),
|
||||||
|
|
||||||
|
Inst::XmmFakeDef { dst } => format!(
|
||||||
|
"{} {}",
|
||||||
|
ljustify("fake_def".into()),
|
||||||
|
dst.show_rru(mb_rru),
|
||||||
|
),
|
||||||
|
|
||||||
Inst::XmmLoadConstSeq { val, dst, .. } => {
|
Inst::XmmLoadConstSeq { val, dst, .. } => {
|
||||||
format!("load_const ${:?}, {}", val, dst.show_rru(mb_rru),)
|
format!("load_const ${:?}, {}", val, dst.show_rru(mb_rru),)
|
||||||
}
|
}
|
||||||
@@ -1754,6 +1769,7 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
collector.add_mod(*dst);
|
collector.add_mod(*dst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Inst::XmmFakeDef { dst } => collector.add_def(*dst),
|
||||||
Inst::XmmLoadConstSeq { dst, .. } => collector.add_def(*dst),
|
Inst::XmmLoadConstSeq { dst, .. } => collector.add_def(*dst),
|
||||||
Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => {
|
Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => {
|
||||||
collector.add_use(*lhs);
|
collector.add_use(*lhs);
|
||||||
@@ -2088,6 +2104,9 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
src.map_uses(mapper);
|
src.map_uses(mapper);
|
||||||
map_mod(mapper, dst);
|
map_mod(mapper, dst);
|
||||||
}
|
}
|
||||||
|
Inst::XmmFakeDef { ref mut dst, .. } => {
|
||||||
|
map_def(mapper, dst);
|
||||||
|
}
|
||||||
Inst::XmmLoadConstSeq { ref mut dst, .. } => {
|
Inst::XmmLoadConstSeq { ref mut dst, .. } => {
|
||||||
map_def(mapper, dst);
|
map_def(mapper, dst);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2945,6 +2945,138 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Opcode::Splat => {
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
assert_eq!(ty.bits(), 128);
|
||||||
|
let src_ty = ctx.input_ty(insn, 0);
|
||||||
|
assert!(src_ty.bits() < 128);
|
||||||
|
let src = input_to_reg_mem(ctx, inputs[0]);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
|
||||||
|
fn emit_insert_lane<C: LowerCtx<I = Inst>>(
|
||||||
|
ctx: &mut C,
|
||||||
|
src: RegMem,
|
||||||
|
dst: Writable<Reg>,
|
||||||
|
lane: u8,
|
||||||
|
ty: Type,
|
||||||
|
) {
|
||||||
|
if !ty.is_float() {
|
||||||
|
let (sse_op, is64) = match ty.lane_bits() {
|
||||||
|
8 => (SseOpcode::Pinsrb, false),
|
||||||
|
16 => (SseOpcode::Pinsrw, false),
|
||||||
|
32 => (SseOpcode::Pinsrd, false),
|
||||||
|
64 => (SseOpcode::Pinsrd, true),
|
||||||
|
_ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
|
||||||
|
};
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
|
||||||
|
} else if ty == types::F32 {
|
||||||
|
let sse_op = SseOpcode::Insertps;
|
||||||
|
// Insert 32 bits from replacement (source index 00, imm8 bits 7:6) to vector (lane
|
||||||
|
// shifted into imm8 bits 5:4).
|
||||||
|
let lane = 0b00_00_00_00 | lane << 4;
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
|
||||||
|
} else if ty == types::F64 {
|
||||||
|
let sse_op = match lane {
|
||||||
|
// Move the lowest quadword in replacement to vector without changing
|
||||||
|
// the upper bits.
|
||||||
|
0 => SseOpcode::Movsd,
|
||||||
|
// Move the low 64 bits of replacement vector to the high 64 bits of the
|
||||||
|
// vector.
|
||||||
|
1 => SseOpcode::Movlhps,
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
// Here we use the `xmm_rm_r` encoding because it correctly tells the register
|
||||||
|
// allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
|
||||||
|
// encoding formats like `xmm_unary_rm_r` treat it as a `def`.
|
||||||
|
ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// We know that splat will overwrite all of the lanes of `dst` but it takes several
|
||||||
|
// instructions to do so. Because of the multiple instructions, there is no good way to
|
||||||
|
// declare `dst` a `def` except with the following pseudo-instruction.
|
||||||
|
ctx.emit(Inst::xmm_fake_def(dst));
|
||||||
|
match ty.lane_bits() {
|
||||||
|
8 => {
|
||||||
|
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
|
||||||
|
// Initialize a register with all 0s.
|
||||||
|
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
|
||||||
|
// Shuffle the lowest byte lane to all other lanes.
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
|
||||||
|
}
|
||||||
|
16 => {
|
||||||
|
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
|
||||||
|
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
|
||||||
|
// Shuffle the lowest two lanes to all other lanes.
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Pshufd,
|
||||||
|
RegMem::from(dst),
|
||||||
|
dst,
|
||||||
|
0,
|
||||||
|
false,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
32 => {
|
||||||
|
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
|
||||||
|
// Shuffle the lowest lane to all other lanes.
|
||||||
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Pshufd,
|
||||||
|
RegMem::from(dst),
|
||||||
|
dst,
|
||||||
|
0,
|
||||||
|
false,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
64 => {
|
||||||
|
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
|
||||||
|
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
|
||||||
|
}
|
||||||
|
_ => panic!("Invalid type to splat: {}", ty),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Opcode::VanyTrue => {
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
let src_ty = ctx.input_ty(insn, 0);
|
||||||
|
assert_eq!(src_ty.bits(), 128);
|
||||||
|
let src = put_input_in_reg(ctx, inputs[0]);
|
||||||
|
// Set the ZF if the result is all zeroes.
|
||||||
|
ctx.emit(Inst::xmm_cmp_rm_r(SseOpcode::Ptest, RegMem::reg(src), src));
|
||||||
|
// If the ZF is not set, place a 1 in `dst`.
|
||||||
|
ctx.emit(Inst::setcc(CC::NZ, dst));
|
||||||
|
}
|
||||||
|
|
||||||
|
Opcode::VallTrue => {
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
let src_ty = ctx.input_ty(insn, 0);
|
||||||
|
assert_eq!(src_ty.bits(), 128);
|
||||||
|
let src = input_to_reg_mem(ctx, inputs[0]);
|
||||||
|
|
||||||
|
let eq = |ty: Type| match ty.lane_bits() {
|
||||||
|
8 => SseOpcode::Pcmpeqb,
|
||||||
|
16 => SseOpcode::Pcmpeqw,
|
||||||
|
32 => SseOpcode::Pcmpeqd,
|
||||||
|
64 => SseOpcode::Pcmpeqq,
|
||||||
|
_ => panic!("Unable to find an instruction for {} for type: {}", op, ty),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Initialize a register with all 0s.
|
||||||
|
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
||||||
|
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
|
||||||
|
// Compare `src` against the zeroed `tmp`: each lane of `src` that is all 0s becomes all 1s in `tmp`.
|
||||||
|
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
|
||||||
|
// Set the ZF if the result is all zeroes.
|
||||||
|
ctx.emit(Inst::xmm_cmp_rm_r(
|
||||||
|
SseOpcode::Ptest,
|
||||||
|
RegMem::from(tmp),
|
||||||
|
tmp.to_reg(),
|
||||||
|
));
|
||||||
|
// If the ZF is set, place a 1 in `dst`.
|
||||||
|
ctx.emit(Inst::setcc(CC::Z, dst));
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::IaddImm
|
Opcode::IaddImm
|
||||||
| Opcode::ImulImm
|
| Opcode::ImulImm
|
||||||
| Opcode::UdivImm
|
| Opcode::UdivImm
|
||||||
|
|||||||
Reference in New Issue
Block a user