Merge pull request #2278 from akirilov-arm/load_splat

Introduce the Cranelift IR instruction `LoadSplat`
This commit is contained in:
Chris Fallin
2020-10-28 12:54:03 -07:00
committed by GitHub
13 changed files with 554 additions and 222 deletions

View File

@@ -3,7 +3,7 @@
use crate::data_value::DataValue;
use crate::ir::{
condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
Inst as IRInst, InstructionData, LibCall, Opcode, Signature, SourceLoc, Type,
};
use crate::isa::x64::abi::*;
use crate::isa::x64::inst::args::*;
@@ -227,6 +227,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
dst: Writable<Reg>,
lane: u8,
ty: Type,
srcloc: Option<SourceLoc>,
) {
if !ty.is_float() {
let (sse_op, is64) = match ty.lane_bits() {
@@ -236,13 +237,13 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
64 => (SseOpcode::Pinsrd, true),
_ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
};
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64, srcloc));
} else if ty == types::F32 {
let sse_op = SseOpcode::Insertps;
// Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
// shifted into bits 5:6).
let lane = 0b00_00_00_00 | lane << 4;
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false, srcloc));
} else if ty == types::F64 {
let sse_op = match lane {
// Move the lowest quadword in replacement to vector without changing
@@ -256,7 +257,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
// Here we use the `xmm_rm_r` encoding because it correctly tells the register
// allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
// encoding formats like `xmm_unary_rm_r` treat it as a `def`.
ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
ctx.emit(Inst::xmm_rm_r(sse_op, src, dst, srcloc));
} else {
panic!("unable to emit insertlane for type: {}", ty)
}
@@ -694,6 +695,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
SseOpcode::Pmuludq,
RegMem::reg(lhs.clone()),
rhs_1,
None,
));
// B' = B
@@ -707,7 +709,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
RegMemImm::imm(32),
lhs_1,
));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pmuludq,
RegMem::reg(rhs),
lhs_1,
None,
));
// B' = B' + A'
// B' = B' << 32
@@ -715,6 +722,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
SseOpcode::Paddq,
RegMem::reg(rhs_1.to_reg()),
lhs_1,
None,
));
ctx.emit(Inst::xmm_rmi_reg(
SseOpcode::Psllq,
@@ -731,11 +739,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
SseOpcode::Pmuludq,
RegMem::reg(lhs.clone()),
rhs_1,
None,
));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Paddq,
RegMem::reg(lhs_1.to_reg()),
rhs_1,
None,
));
ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
return Ok(());
@@ -770,7 +780,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Move the `lhs` to the same register as `dst`.
ctx.emit(Inst::gen_move(dst, lhs, ty));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
} else {
let is_64 = ty == types::I64;
let alu_op = match op {
@@ -828,7 +838,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Note the flipping of operands: the `rhs` operand is used as the destination instead
// of the `lhs` as in the other bit operations above (e.g. `band`).
ctx.emit(Inst::gen_move(dst, rhs, ty));
ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst, None));
}
Opcode::Iabs => {
@@ -884,7 +894,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Move the `lhs` to the same register as `dst`.
ctx.emit(Inst::gen_move(dst, lhs, ty));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
} else {
panic!("Unsupported type for {} instruction: {}", op, ty);
}
@@ -1007,8 +1017,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
SseOpcode::Pxor,
RegMem::reg(tmp.to_reg()),
tmp,
None,
));
ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp, None));
ctx.emit(Inst::xmm_unary_rm_r(
SseOpcode::Movapd,
RegMem::reg(tmp.to_reg()),
@@ -1561,34 +1572,44 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
match condcode {
IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)),
IntCC::NotEqual => {
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
// Emit all 1s into the `tmp` register.
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
// Invert the result of the `PCMPEQ*`.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::from(tmp),
dst,
None,
));
}
IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst, None))
}
IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst, None));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
}
IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst, None));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
// Emit all 1s into the `tmp` register.
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
// Invert the result of the `PCMPEQ*`.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::from(tmp),
dst,
None,
));
}
IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst, None));
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
}
_ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
}
@@ -1686,7 +1707,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::gen_move(dst, lhs, input_ty));
// Emit the comparison.
ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false));
ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false, None));
}
}
@@ -1899,7 +1920,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ty
),
};
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
}
Opcode::Fmin | Opcode::Fmax => {
@@ -1988,15 +2009,15 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
// Perform min in reverse direction
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1, None));
// Perform min in original direction
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst, None));
// X64 handles propagation of -0's and Nans differently between left and right
// operands. After doing the min in both directions, this OR will
// guarrentee capture of -0's and Nan in our tmp register
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1, None));
// Compare unordered to create mask for lanes containing NaNs and then use
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
@@ -2009,8 +2030,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
dst,
cond.encode(),
false,
None,
));
ctx.emit(Inst::xmm_rm_r(
or_op,
RegMem::reg(dst.to_reg()),
tmp_xmm1,
None,
));
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
// The dst register holds a mask for lanes containing NaNs.
// We take that mask and shift in preparation for creating a different mask
@@ -2022,7 +2049,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Finally we do a nand with the tmp register to produce the final results
// in the dst.
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
ctx.emit(Inst::xmm_rm_r(
andn_op,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
None,
));
} else {
let (
mov_op,
@@ -2065,23 +2097,43 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
// Perform max in reverse direction.
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
ctx.emit(Inst::xmm_rm_r(
max_op,
RegMem::reg(dst.to_reg()),
tmp_xmm1,
None,
));
// Perform max in original direction.
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst, None));
// Get the difference between the two results and store in tmp.
// Max uses a different approach than min to account for potential
// discrepancies with plus/minus 0.
ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
ctx.emit(Inst::xmm_rm_r(
xor_op,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
None,
));
// X64 handles propagation of -0's and Nans differently between left and right
// operands. After doing the max in both directions, this OR will
// guarentee capture of 0's and Nan in our tmp register.
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
ctx.emit(Inst::xmm_rm_r(
or_op,
RegMem::reg(dst.to_reg()),
tmp_xmm1,
None,
));
// Capture NaNs and sign discrepancies.
ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
ctx.emit(Inst::xmm_rm_r(
sub_op,
RegMem::reg(dst.to_reg()),
tmp_xmm1,
None,
));
// Compare unordered to create mask for lanes containing NaNs and then use
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
@@ -2092,6 +2144,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
dst,
cond.encode(),
false,
None,
));
// The dst register holds a mask for lanes containing NaNs.
@@ -2104,7 +2157,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Finally we do a nand with the tmp register to produce the final results
// in the dst.
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
ctx.emit(Inst::xmm_rm_r(
andn_op,
RegMem::reg(tmp_xmm1.to_reg()),
dst,
None,
));
}
}
}
@@ -2340,7 +2398,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(inst);
}
ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
ctx.emit(Inst::xmm_rm_r(opcode, src, dst, None));
} else {
// Eventually vector constants should be available in `gen_constant` and this block
// can be merged with the one above (TODO).
@@ -2361,6 +2419,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
tmp,
cond.encode(),
false,
None,
);
ctx.emit(cmpps);
@@ -2380,7 +2439,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(shift);
// Apply shifted mask (XOR or AND).
let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst, None);
ctx.emit(mask);
} else {
panic!("unexpected type {:?} for Fabs", output_ty);
@@ -2439,14 +2498,20 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
dst,
None,
));
ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst, None));
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None));
ctx.emit(Inst::xmm_rm_r(
and_op,
RegMem::reg(tmp_xmm1.to_reg()),
tmp_xmm2,
None,
));
ctx.emit(Inst::xmm_rm_r(
or_op,
RegMem::reg(tmp_xmm2.to_reg()),
dst,
None,
));
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
}
Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
@@ -3167,7 +3232,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// After loading the constructed mask in a temporary register, we use this to
// shuffle the `dst` register (remember that, in this case, it is the same as
// `src` so we disregard this register).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::from(tmp),
dst,
None,
));
} else {
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
// them together. This is necessary due to PSHUFB semantics. As in the case above,
@@ -3179,7 +3249,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::from(tmp1),
tmp0,
None,
));
// PSHUFB the second argument, placing zeroes for unused lanes.
let constructed_mask = mask
@@ -3189,11 +3264,21 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
.collect();
let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::from(tmp2),
dst,
None,
));
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
// is not important).
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Orps,
RegMem::from(tmp0),
dst,
None,
));
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
}
@@ -3227,6 +3312,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
SseOpcode::Paddusb,
RegMem::from(zero_mask),
swizzle_mask,
None,
));
// Shuffle `dst` using the fixed-up `swizzle_mask`.
@@ -3234,6 +3320,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
SseOpcode::Pshufb,
RegMem::from(swizzle_mask),
dst,
None,
));
}
@@ -3253,7 +3340,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
debug_assert!(lane < ty.lane_count() as u8);
ctx.emit(Inst::gen_move(dst, in_vec, ty));
emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
emit_insert_lane(ctx, src, dst, lane, ty.lane_type(), None);
}
Opcode::Extractlane => {
@@ -3279,7 +3366,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
};
let src = RegMem::reg(src);
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit, None));
} else {
if lane == 0 {
// Remove the extractlane instruction, leaving the float where it is. The upper
@@ -3301,35 +3388,57 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => unreachable!(),
};
let src = RegMem::reg(src);
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false, None));
}
}
}
Opcode::Splat => {
Opcode::Splat | Opcode::LoadSplat => {
let ty = ty.unwrap();
assert_eq!(ty.bits(), 128);
let src_ty = ctx.input_ty(insn, 0);
assert!(src_ty.bits() < 128);
let src = input_to_reg_mem(ctx, inputs[0]);
let (src, srcloc) = match op {
Opcode::Splat => (input_to_reg_mem(ctx, inputs[0]), None),
Opcode::LoadSplat => {
let offset = ctx.data(insn).load_store_offset().unwrap();
let amode = lower_to_amode(ctx, inputs[0], offset);
(RegMem::mem(amode), Some(ctx.srcloc(insn)))
}
_ => unreachable!(),
};
let dst = get_output_reg(ctx, outputs[0]);
// We know that splat will overwrite all of the lanes of `dst` but it takes several
// instructions to do so. Because of the multiple instructions, there is no good way to
// declare `dst` a `def` except with the following pseudo-instruction.
ctx.emit(Inst::xmm_uninit_value(dst));
// TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
// and VPBROADCAST*.
match ty.lane_bits() {
8 => {
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
// Initialize a register with all 0s.
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::from(tmp),
tmp,
srcloc,
));
// Shuffle the lowest byte lane to all other lanes.
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pshufb,
RegMem::from(tmp),
dst,
srcloc,
))
}
16 => {
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
// Shuffle the lowest two lanes to all other lanes.
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
@@ -3337,10 +3446,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
dst,
0,
false,
srcloc,
))
}
32 => {
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
// Shuffle the lowest lane to all other lanes.
ctx.emit(Inst::xmm_rm_r_imm(
SseOpcode::Pshufd,
@@ -3348,11 +3458,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
dst,
0,
false,
srcloc,
))
}
64 => {
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
}
_ => panic!("Invalid type to splat: {}", ty),
}
@@ -3386,9 +3497,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Initialize a register with all 0s.
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Pxor,
RegMem::from(tmp),
tmp,
None,
));
// Compare to see what lanes are filled with all 1s.
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp, None));
// Set the ZF if the result is all zeroes.
ctx.emit(Inst::xmm_cmp_rm_r(
SseOpcode::Ptest,