Merge pull request #2278 from akirilov-arm/load_splat
Introduce the Cranelift IR instruction `LoadSplat`
This commit is contained in:
@@ -396,6 +396,7 @@ fn define_simd(
|
|||||||
let insertlane = insts.by_name("insertlane");
|
let insertlane = insts.by_name("insertlane");
|
||||||
let ishl = insts.by_name("ishl");
|
let ishl = insts.by_name("ishl");
|
||||||
let ishl_imm = insts.by_name("ishl_imm");
|
let ishl_imm = insts.by_name("ishl_imm");
|
||||||
|
let load_splat = insts.by_name("load_splat");
|
||||||
let raw_bitcast = insts.by_name("raw_bitcast");
|
let raw_bitcast = insts.by_name("raw_bitcast");
|
||||||
let scalar_to_vector = insts.by_name("scalar_to_vector");
|
let scalar_to_vector = insts.by_name("scalar_to_vector");
|
||||||
let splat = insts.by_name("splat");
|
let splat = insts.by_name("splat");
|
||||||
@@ -820,6 +821,7 @@ fn define_simd(
|
|||||||
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
|
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
|
||||||
narrow.custom_legalize(fmin, "expand_minmax_vector");
|
narrow.custom_legalize(fmin, "expand_minmax_vector");
|
||||||
narrow.custom_legalize(fmax, "expand_minmax_vector");
|
narrow.custom_legalize(fmax, "expand_minmax_vector");
|
||||||
|
narrow.custom_legalize(load_splat, "expand_load_splat");
|
||||||
|
|
||||||
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
|
||||||
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
|
||||||
|
|||||||
@@ -4459,5 +4459,24 @@ pub(crate) fn define(
|
|||||||
.other_side_effects(true),
|
.other_side_effects(true),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address");
|
||||||
|
let a = &Operand::new("a", TxN);
|
||||||
|
|
||||||
|
ig.push(
|
||||||
|
Inst::new(
|
||||||
|
"load_splat",
|
||||||
|
r#"
|
||||||
|
Load an element from memory at ``p + Offset`` and return a vector
|
||||||
|
whose lanes are all set to that element.
|
||||||
|
|
||||||
|
This is equivalent to ``load`` followed by ``splat``.
|
||||||
|
"#,
|
||||||
|
&formats.load,
|
||||||
|
)
|
||||||
|
.operands_in(vec![MemFlags, p, Offset])
|
||||||
|
.operands_out(vec![a])
|
||||||
|
.can_load(true),
|
||||||
|
);
|
||||||
|
|
||||||
ig.build()
|
ig.build()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -680,4 +680,19 @@ impl VectorSize {
|
|||||||
_ => *self,
|
_ => *self,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the encoding bits that are used by some SIMD instructions
|
||||||
|
/// for a particular operand size.
|
||||||
|
pub fn enc_size(&self) -> (u32, u32) {
|
||||||
|
let q = self.is_128bits() as u32;
|
||||||
|
let size = match self.lane_size() {
|
||||||
|
ScalarSize::Size8 => 0b00,
|
||||||
|
ScalarSize::Size16 => 0b01,
|
||||||
|
ScalarSize::Size32 => 0b10,
|
||||||
|
ScalarSize::Size64 => 0b11,
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
|
||||||
|
(q, size)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
|
|||||||
(op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
|
(op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
|
||||||
|
debug_assert_eq!(q & 0b1, q);
|
||||||
|
debug_assert_eq!(size & 0b11, size);
|
||||||
|
0b0_0_0011010_10_00000_110_0_00_00000_00000
|
||||||
|
| q << 30
|
||||||
|
| size << 10
|
||||||
|
| machreg_to_gpr(rn) << 5
|
||||||
|
| machreg_to_vec(rt.to_reg())
|
||||||
|
}
|
||||||
|
|
||||||
fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
|
||||||
(top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
|
(top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
|
||||||
}
|
}
|
||||||
@@ -1381,14 +1391,7 @@ impl MachInstEmit for Inst {
|
|||||||
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
|
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
|
||||||
}
|
}
|
||||||
&Inst::VecMisc { op, rd, rn, size } => {
|
&Inst::VecMisc { op, rd, rn, size } => {
|
||||||
let enc_size = match size.lane_size() {
|
let (q, enc_size) = size.enc_size();
|
||||||
ScalarSize::Size8 => 0b00,
|
|
||||||
ScalarSize::Size16 => 0b01,
|
|
||||||
ScalarSize::Size32 => 0b10,
|
|
||||||
ScalarSize::Size64 => 0b11,
|
|
||||||
_ => unreachable!(),
|
|
||||||
};
|
|
||||||
let q = if size.is_128bits() { 1 } else { 0 };
|
|
||||||
let (u, bits_12_16, size) = match op {
|
let (u, bits_12_16, size) = match op {
|
||||||
VecMisc2::Not => (0b1, 0b00101, 0b00),
|
VecMisc2::Not => (0b1, 0b00101, 0b00),
|
||||||
VecMisc2::Neg => (0b1, 0b01011, enc_size),
|
VecMisc2::Neg => (0b1, 0b01011, enc_size),
|
||||||
@@ -1831,13 +1834,7 @@ impl MachInstEmit for Inst {
|
|||||||
alu_op,
|
alu_op,
|
||||||
size,
|
size,
|
||||||
} => {
|
} => {
|
||||||
let enc_size = match size.lane_size() {
|
let (q, enc_size) = size.enc_size();
|
||||||
ScalarSize::Size8 => 0b00,
|
|
||||||
ScalarSize::Size16 => 0b01,
|
|
||||||
ScalarSize::Size32 => 0b10,
|
|
||||||
ScalarSize::Size64 => 0b11,
|
|
||||||
_ => unreachable!(),
|
|
||||||
};
|
|
||||||
let is_float = match alu_op {
|
let is_float = match alu_op {
|
||||||
VecALUOp::Fcmeq
|
VecALUOp::Fcmeq
|
||||||
| VecALUOp::Fcmgt
|
| VecALUOp::Fcmgt
|
||||||
@@ -1851,6 +1848,7 @@ impl MachInstEmit for Inst {
|
|||||||
_ => false,
|
_ => false,
|
||||||
};
|
};
|
||||||
let enc_float_size = match (is_float, size) {
|
let enc_float_size = match (is_float, size) {
|
||||||
|
(true, VectorSize::Size32x2) => 0b0,
|
||||||
(true, VectorSize::Size32x4) => 0b0,
|
(true, VectorSize::Size32x4) => 0b0,
|
||||||
(true, VectorSize::Size64x2) => 0b1,
|
(true, VectorSize::Size64x2) => 0b1,
|
||||||
(true, _) => unimplemented!(),
|
(true, _) => unimplemented!(),
|
||||||
@@ -1858,46 +1856,46 @@ impl MachInstEmit for Inst {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let (top11, bit15_10) = match alu_op {
|
let (top11, bit15_10) = match alu_op {
|
||||||
VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
|
VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
|
||||||
VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
|
VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
|
||||||
VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
|
VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
|
||||||
VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
|
VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
|
||||||
VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
|
VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
|
||||||
VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
|
VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
|
||||||
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
|
VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
|
||||||
VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
|
VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
|
||||||
VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
|
VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
|
||||||
VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
|
VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
|
||||||
VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
|
VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
|
||||||
VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
|
VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
|
||||||
// The following logical instructions operate on bytes, so are not encoded differently
|
// The following logical instructions operate on bytes, so are not encoded differently
|
||||||
// for the different vector types.
|
// for the different vector types.
|
||||||
VecALUOp::And => (0b010_01110_00_1, 0b000111),
|
VecALUOp::And => (0b000_01110_00_1, 0b000111),
|
||||||
VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
|
VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
|
||||||
VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
|
VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
|
||||||
VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
|
VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
|
||||||
VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
|
VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
|
||||||
VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
|
VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
|
||||||
VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
|
VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
|
||||||
VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
|
VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
|
||||||
VecALUOp::Mul => {
|
VecALUOp::Mul => {
|
||||||
debug_assert_ne!(size, VectorSize::Size64x2);
|
debug_assert_ne!(size, VectorSize::Size64x2);
|
||||||
(0b010_01110_00_1 | enc_size << 1, 0b100111)
|
(0b000_01110_00_1 | enc_size << 1, 0b100111)
|
||||||
}
|
}
|
||||||
VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
|
VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
|
||||||
VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
|
VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
|
||||||
VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
|
VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
|
||||||
VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
|
VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
|
||||||
VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
|
VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
|
||||||
VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
|
VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
|
||||||
VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
|
VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
|
||||||
VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
|
VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
|
||||||
VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
|
VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
|
||||||
VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
|
VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
|
||||||
VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
|
VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
|
||||||
VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
|
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
|
||||||
VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
|
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
|
||||||
VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
|
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
|
||||||
VecALUOp::Umlal => {
|
VecALUOp::Umlal => {
|
||||||
debug_assert!(!size.is_128bits());
|
debug_assert!(!size.is_128bits());
|
||||||
(0b001_01110_00_1 | enc_size << 1, 0b100000)
|
(0b001_01110_00_1 | enc_size << 1, 0b100000)
|
||||||
@@ -1905,12 +1903,27 @@ impl MachInstEmit for Inst {
|
|||||||
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
|
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
|
||||||
};
|
};
|
||||||
let top11 = if is_float {
|
let top11 = if is_float {
|
||||||
top11 | enc_float_size << 1
|
top11 | (q << 9) | enc_float_size << 1
|
||||||
} else {
|
} else {
|
||||||
top11
|
top11 | (q << 9)
|
||||||
};
|
};
|
||||||
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
|
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
|
||||||
}
|
}
|
||||||
|
&Inst::VecLoadReplicate {
|
||||||
|
rd,
|
||||||
|
rn,
|
||||||
|
size,
|
||||||
|
srcloc,
|
||||||
|
} => {
|
||||||
|
let (q, size) = size.enc_size();
|
||||||
|
|
||||||
|
if let Some(srcloc) = srcloc {
|
||||||
|
// Register the offset at which the actual load instruction starts.
|
||||||
|
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
|
||||||
|
}
|
||||||
|
|
||||||
|
sink.put4(enc_ldst_vec(q, size, rn, rd));
|
||||||
|
}
|
||||||
&Inst::MovToNZCV { rn } => {
|
&Inst::MovToNZCV { rn } => {
|
||||||
sink.put4(0xd51b4200 | machreg_to_gpr(rn));
|
sink.put4(0xd51b4200 | machreg_to_gpr(rn));
|
||||||
}
|
}
|
||||||
@@ -2195,9 +2208,12 @@ impl MachInstEmit for Inst {
|
|||||||
inst.emit(sink, emit_info, state);
|
inst.emit(sink, emit_info, state);
|
||||||
}
|
}
|
||||||
|
|
||||||
let (reg, offset) = match mem {
|
let (reg, index_reg, offset) = match mem {
|
||||||
AMode::Unscaled(r, simm9) => (r, simm9.value()),
|
AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
|
||||||
AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
|
AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
|
||||||
|
AMode::UnsignedOffset(r, uimm12scaled) => {
|
||||||
|
(r, None, uimm12scaled.value() as i32)
|
||||||
|
}
|
||||||
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
|
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
|
||||||
};
|
};
|
||||||
let abs_offset = if offset < 0 {
|
let abs_offset = if offset < 0 {
|
||||||
@@ -2211,9 +2227,22 @@ impl MachInstEmit for Inst {
|
|||||||
ALUOp::Add64
|
ALUOp::Add64
|
||||||
};
|
};
|
||||||
|
|
||||||
if offset == 0 {
|
if let Some((idx, extendop)) = index_reg {
|
||||||
|
let add = Inst::AluRRRExtend {
|
||||||
|
alu_op: ALUOp::Add64,
|
||||||
|
rd,
|
||||||
|
rn: reg,
|
||||||
|
rm: idx,
|
||||||
|
extendop,
|
||||||
|
};
|
||||||
|
|
||||||
|
add.emit(sink, emit_info, state);
|
||||||
|
} else if offset == 0 {
|
||||||
|
if reg != rd.to_reg() {
|
||||||
let mov = Inst::mov(rd, reg);
|
let mov = Inst::mov(rd, reg);
|
||||||
|
|
||||||
mov.emit(sink, emit_info, state);
|
mov.emit(sink, emit_info, state);
|
||||||
|
}
|
||||||
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
|
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
|
||||||
let add = Inst::AluRRImm12 {
|
let add = Inst::AluRRImm12 {
|
||||||
alu_op,
|
alu_op,
|
||||||
|
|||||||
@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
|
|||||||
rd: writable_vreg(28),
|
rd: writable_vreg(28),
|
||||||
rn: vreg(12),
|
rn: vreg(12),
|
||||||
rm: vreg(4),
|
rm: vreg(4),
|
||||||
size: VectorSize::Size32x4,
|
size: VectorSize::Size32x2,
|
||||||
},
|
},
|
||||||
"9CE5244E",
|
"9CE5240E",
|
||||||
"fcmeq v28.4s, v12.4s, v4.4s",
|
"fcmeq v28.2s, v12.2s, v4.2s",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
|
|||||||
rd: writable_vreg(6),
|
rd: writable_vreg(6),
|
||||||
rn: vreg(9),
|
rn: vreg(9),
|
||||||
rm: vreg(8),
|
rm: vreg(8),
|
||||||
size: VectorSize::Size8x16,
|
size: VectorSize::Size8x8,
|
||||||
},
|
},
|
||||||
"2665286E",
|
"2665282E",
|
||||||
"umax v6.16b, v9.16b, v8.16b",
|
"umax v6.8b, v9.8b, v8.8b",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
@@ -3805,6 +3805,28 @@ fn test_aarch64_binemit() {
|
|||||||
"tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
|
"tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecLoadReplicate {
|
||||||
|
rd: writable_vreg(31),
|
||||||
|
rn: xreg(0),
|
||||||
|
srcloc: None,
|
||||||
|
size: VectorSize::Size64x2,
|
||||||
|
},
|
||||||
|
"1FCC404D",
|
||||||
|
"ld1r { v31.2d }, [x0]",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::VecLoadReplicate {
|
||||||
|
rd: writable_vreg(0),
|
||||||
|
rn: xreg(25),
|
||||||
|
srcloc: None,
|
||||||
|
size: VectorSize::Size8x8,
|
||||||
|
},
|
||||||
|
"20C3400D",
|
||||||
|
"ld1r { v0.8b }, [x25]",
|
||||||
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::Extend {
|
Inst::Extend {
|
||||||
rd: writable_xreg(1),
|
rd: writable_xreg(1),
|
||||||
|
|||||||
@@ -1021,6 +1021,14 @@ pub enum Inst {
|
|||||||
is_extension: bool,
|
is_extension: bool,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// Load an element and replicate to all lanes of a vector.
|
||||||
|
VecLoadReplicate {
|
||||||
|
rd: Writable<Reg>,
|
||||||
|
rn: Reg,
|
||||||
|
size: VectorSize,
|
||||||
|
srcloc: Option<SourceLoc>,
|
||||||
|
},
|
||||||
|
|
||||||
/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
|
/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
|
||||||
MovToNZCV {
|
MovToNZCV {
|
||||||
rn: Reg,
|
rn: Reg,
|
||||||
@@ -1664,7 +1672,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
collector.add_def(rd);
|
collector.add_def(rd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
&Inst::VecLoadReplicate { rd, rn, .. } => {
|
||||||
|
collector.add_def(rd);
|
||||||
|
collector.add_use(rn);
|
||||||
|
}
|
||||||
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
|
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
|
||||||
collector.add_use(rn);
|
collector.add_use(rn);
|
||||||
collector.add_use(rm);
|
collector.add_use(rm);
|
||||||
@@ -1817,8 +1828,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
|||||||
&Inst::LoadExtName { rd, .. } => {
|
&Inst::LoadExtName { rd, .. } => {
|
||||||
collector.add_def(rd);
|
collector.add_def(rd);
|
||||||
}
|
}
|
||||||
&Inst::LoadAddr { rd, mem: _ } => {
|
&Inst::LoadAddr { rd, ref mem } => {
|
||||||
collector.add_def(rd);
|
collector.add_def(rd);
|
||||||
|
memarg_regs(mem, collector);
|
||||||
}
|
}
|
||||||
&Inst::VirtualSPOffsetAdj { .. } => {}
|
&Inst::VirtualSPOffsetAdj { .. } => {}
|
||||||
&Inst::EmitIsland { .. } => {}
|
&Inst::EmitIsland { .. } => {}
|
||||||
@@ -2262,6 +2274,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
|||||||
map_def(mapper, rd);
|
map_def(mapper, rd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
&mut Inst::VecLoadReplicate {
|
||||||
|
ref mut rd,
|
||||||
|
ref mut rn,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
map_def(mapper, rd);
|
||||||
|
map_use(mapper, rn);
|
||||||
|
}
|
||||||
&mut Inst::FpuCmp32 {
|
&mut Inst::FpuCmp32 {
|
||||||
ref mut rn,
|
ref mut rn,
|
||||||
ref mut rm,
|
ref mut rm,
|
||||||
@@ -3507,6 +3527,12 @@ impl Inst {
|
|||||||
let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
|
let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
|
||||||
format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
|
format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
|
||||||
}
|
}
|
||||||
|
&Inst::VecLoadReplicate { rd, rn, size, .. } => {
|
||||||
|
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
|
||||||
|
let rn = rn.show_rru(mb_rru);
|
||||||
|
|
||||||
|
format!("ld1r {{ {} }}, [{}]", rd, rn)
|
||||||
|
}
|
||||||
&Inst::MovToNZCV { rn } => {
|
&Inst::MovToNZCV { rn } => {
|
||||||
let rn = rn.show_rru(mb_rru);
|
let rn = rn.show_rru(mb_rru);
|
||||||
format!("msr nzcv, {}", rn)
|
format!("msr nzcv, {}", rn)
|
||||||
|
|||||||
@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Opcode::LoadSplat => {
|
||||||
|
let off = ctx.data(insn).load_store_offset().unwrap();
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
|
||||||
|
let memflags = ctx.memflags(insn).expect("memory flags");
|
||||||
|
let rd = get_output_reg(ctx, outputs[0]);
|
||||||
|
let size = VectorSize::from_ty(ty);
|
||||||
|
let srcloc = if memflags.notrap() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(ctx.srcloc(insn))
|
||||||
|
};
|
||||||
|
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
|
||||||
|
|
||||||
|
ctx.emit(Inst::LoadAddr { rd: tmp, mem });
|
||||||
|
ctx.emit(Inst::VecLoadReplicate {
|
||||||
|
rd,
|
||||||
|
rn: tmp.to_reg(),
|
||||||
|
size,
|
||||||
|
srcloc,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::Store
|
Opcode::Store
|
||||||
| Opcode::Istore8
|
| Opcode::Istore8
|
||||||
| Opcode::Istore16
|
| Opcode::Istore16
|
||||||
|
|||||||
@@ -1728,6 +1728,7 @@ pub(crate) fn emit(
|
|||||||
op,
|
op,
|
||||||
src: src_e,
|
src: src_e,
|
||||||
dst: reg_g,
|
dst: reg_g,
|
||||||
|
srcloc,
|
||||||
} => {
|
} => {
|
||||||
let rex = RexFlags::clear_w();
|
let rex = RexFlags::clear_w();
|
||||||
let (prefix, opcode, length) = match op {
|
let (prefix, opcode, length) = match op {
|
||||||
@@ -1820,6 +1821,10 @@ pub(crate) fn emit(
|
|||||||
emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex);
|
emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex);
|
||||||
}
|
}
|
||||||
RegMem::Mem { addr } => {
|
RegMem::Mem { addr } => {
|
||||||
|
if let Some(srcloc) = *srcloc {
|
||||||
|
// Register the offset at which the actual load instruction starts.
|
||||||
|
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
|
||||||
|
}
|
||||||
let addr = &addr.finalize(state);
|
let addr = &addr.finalize(state);
|
||||||
emit_std_reg_mem(sink, prefix, opcode, length, reg_g.to_reg(), addr, rex);
|
emit_std_reg_mem(sink, prefix, opcode, length, reg_g.to_reg(), addr, rex);
|
||||||
}
|
}
|
||||||
@@ -1890,7 +1895,7 @@ pub(crate) fn emit(
|
|||||||
// and negative zero. These instructions merge the sign bits in that
|
// and negative zero. These instructions merge the sign bits in that
|
||||||
// case, and are no-ops otherwise.
|
// case, and are no-ops otherwise.
|
||||||
let op = if *is_min { or_op } else { and_op };
|
let op = if *is_min { or_op } else { and_op };
|
||||||
let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst);
|
let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst, None);
|
||||||
inst.emit(sink, info, state);
|
inst.emit(sink, info, state);
|
||||||
|
|
||||||
let inst = Inst::jmp_known(done);
|
let inst = Inst::jmp_known(done);
|
||||||
@@ -1900,13 +1905,13 @@ pub(crate) fn emit(
|
|||||||
// read-only operand: perform an addition between the two operands, which has the
|
// read-only operand: perform an addition between the two operands, which has the
|
||||||
// desired NaN propagation effects.
|
// desired NaN propagation effects.
|
||||||
sink.bind_label(propagate_nan);
|
sink.bind_label(propagate_nan);
|
||||||
let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst);
|
let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst, None);
|
||||||
inst.emit(sink, info, state);
|
inst.emit(sink, info, state);
|
||||||
|
|
||||||
one_way_jmp(sink, CC::P, done);
|
one_way_jmp(sink, CC::P, done);
|
||||||
|
|
||||||
sink.bind_label(do_min_max);
|
sink.bind_label(do_min_max);
|
||||||
let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst);
|
let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst, None);
|
||||||
inst.emit(sink, info, state);
|
inst.emit(sink, info, state);
|
||||||
|
|
||||||
sink.bind_label(done);
|
sink.bind_label(done);
|
||||||
@@ -1917,7 +1922,8 @@ pub(crate) fn emit(
|
|||||||
src,
|
src,
|
||||||
dst,
|
dst,
|
||||||
imm,
|
imm,
|
||||||
is64: w,
|
is64,
|
||||||
|
srcloc,
|
||||||
} => {
|
} => {
|
||||||
let (prefix, opcode, len) = match op {
|
let (prefix, opcode, len) = match op {
|
||||||
SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
|
SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
|
||||||
@@ -1934,7 +1940,7 @@ pub(crate) fn emit(
|
|||||||
SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
|
SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
|
||||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
};
|
};
|
||||||
let rex = if *w {
|
let rex = if *is64 {
|
||||||
RexFlags::set_w()
|
RexFlags::set_w()
|
||||||
} else {
|
} else {
|
||||||
RexFlags::clear_w()
|
RexFlags::clear_w()
|
||||||
@@ -1956,6 +1962,10 @@ pub(crate) fn emit(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
RegMem::Mem { addr } => {
|
RegMem::Mem { addr } => {
|
||||||
|
if let Some(srcloc) = *srcloc {
|
||||||
|
// Register the offset at which the actual load instruction starts.
|
||||||
|
sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
|
||||||
|
}
|
||||||
let addr = &addr.finalize(state);
|
let addr = &addr.finalize(state);
|
||||||
assert!(
|
assert!(
|
||||||
!regs_swapped,
|
!regs_swapped,
|
||||||
@@ -1964,7 +1974,7 @@ pub(crate) fn emit(
|
|||||||
emit_std_reg_mem(sink, prefix, opcode, len, dst.to_reg(), addr, rex);
|
emit_std_reg_mem(sink, prefix, opcode, len, dst.to_reg(), addr, rex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sink.put1(*imm)
|
sink.put1(*imm);
|
||||||
}
|
}
|
||||||
|
|
||||||
Inst::XmmLoadConstSeq { val, dst, ty } => {
|
Inst::XmmLoadConstSeq { val, dst, ty } => {
|
||||||
@@ -2189,7 +2199,7 @@ pub(crate) fn emit(
|
|||||||
} else {
|
} else {
|
||||||
SseOpcode::Addss
|
SseOpcode::Addss
|
||||||
};
|
};
|
||||||
let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst);
|
let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst, None);
|
||||||
inst.emit(sink, info, state);
|
inst.emit(sink, info, state);
|
||||||
|
|
||||||
sink.bind_label(done);
|
sink.bind_label(done);
|
||||||
@@ -2296,8 +2306,12 @@ pub(crate) fn emit(
|
|||||||
// If the input was positive, saturate to INT_MAX.
|
// If the input was positive, saturate to INT_MAX.
|
||||||
|
|
||||||
// Zero out tmp_xmm.
|
// Zero out tmp_xmm.
|
||||||
let inst =
|
let inst = Inst::xmm_rm_r(
|
||||||
Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
|
SseOpcode::Xorpd,
|
||||||
|
RegMem::reg(tmp_xmm.to_reg()),
|
||||||
|
*tmp_xmm,
|
||||||
|
None,
|
||||||
|
);
|
||||||
inst.emit(sink, info, state);
|
inst.emit(sink, info, state);
|
||||||
|
|
||||||
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
|
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
|
||||||
@@ -2368,8 +2382,12 @@ pub(crate) fn emit(
|
|||||||
sink.bind_label(check_positive);
|
sink.bind_label(check_positive);
|
||||||
|
|
||||||
// Zero out the tmp_xmm register.
|
// Zero out the tmp_xmm register.
|
||||||
let inst =
|
let inst = Inst::xmm_rm_r(
|
||||||
Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
|
SseOpcode::Xorpd,
|
||||||
|
RegMem::reg(tmp_xmm.to_reg()),
|
||||||
|
*tmp_xmm,
|
||||||
|
None,
|
||||||
|
);
|
||||||
inst.emit(sink, info, state);
|
inst.emit(sink, info, state);
|
||||||
|
|
||||||
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
|
let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
|
||||||
@@ -2523,7 +2541,7 @@ pub(crate) fn emit(
|
|||||||
|
|
||||||
sink.bind_label(handle_large);
|
sink.bind_label(handle_large);
|
||||||
|
|
||||||
let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src);
|
let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src, None);
|
||||||
inst.emit(sink, info, state);
|
inst.emit(sink, info, state);
|
||||||
|
|
||||||
let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
|
let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
|
||||||
|
|||||||
@@ -2983,12 +2983,12 @@ fn test_x64_emit() {
|
|||||||
// XMM_RM_R: float binary ops
|
// XMM_RM_R: float binary ops
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0),
|
Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0, None),
|
||||||
"F30F58C1",
|
"F30F58C1",
|
||||||
"addss %xmm1, %xmm0",
|
"addss %xmm1, %xmm0",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13),
|
Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13, None),
|
||||||
"F3450F58EB",
|
"F3450F58EB",
|
||||||
"addss %xmm11, %xmm13",
|
"addss %xmm11, %xmm13",
|
||||||
));
|
));
|
||||||
@@ -2997,23 +2997,24 @@ fn test_x64_emit() {
|
|||||||
SseOpcode::Addss,
|
SseOpcode::Addss,
|
||||||
RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)),
|
RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)),
|
||||||
w_xmm0,
|
w_xmm0,
|
||||||
|
None,
|
||||||
),
|
),
|
||||||
"F3410F5844927B",
|
"F3410F5844927B",
|
||||||
"addss 123(%r10,%rdx,4), %xmm0",
|
"addss 123(%r10,%rdx,4), %xmm0",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4),
|
Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4, None),
|
||||||
"F2410F58E7",
|
"F2410F58E7",
|
||||||
"addsd %xmm15, %xmm4",
|
"addsd %xmm15, %xmm4",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1),
|
Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1, None),
|
||||||
"F30F5CC8",
|
"F30F5CC8",
|
||||||
"subss %xmm0, %xmm1",
|
"subss %xmm0, %xmm1",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1),
|
Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1, None),
|
||||||
"F3410F5CCC",
|
"F3410F5CCC",
|
||||||
"subss %xmm12, %xmm1",
|
"subss %xmm12, %xmm1",
|
||||||
));
|
));
|
||||||
@@ -3022,57 +3023,58 @@ fn test_x64_emit() {
|
|||||||
SseOpcode::Subss,
|
SseOpcode::Subss,
|
||||||
RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)),
|
RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)),
|
||||||
w_xmm10,
|
w_xmm10,
|
||||||
|
None,
|
||||||
),
|
),
|
||||||
"F3450F5C94C241010000",
|
"F3450F5C94C241010000",
|
||||||
"subss 321(%r10,%rax,8), %xmm10",
|
"subss 321(%r10,%rax,8), %xmm10",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14),
|
Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14, None),
|
||||||
"F2440F5CF5",
|
"F2440F5CF5",
|
||||||
"subsd %xmm5, %xmm14",
|
"subsd %xmm5, %xmm14",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4),
|
Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4, None),
|
||||||
"F30F59E5",
|
"F30F59E5",
|
||||||
"mulss %xmm5, %xmm4",
|
"mulss %xmm5, %xmm4",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4),
|
Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4, None),
|
||||||
"F20F59E5",
|
"F20F59E5",
|
||||||
"mulsd %xmm5, %xmm4",
|
"mulsd %xmm5, %xmm4",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7),
|
Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7, None),
|
||||||
"F3410F5EF8",
|
"F3410F5EF8",
|
||||||
"divss %xmm8, %xmm7",
|
"divss %xmm8, %xmm7",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4),
|
Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4, None),
|
||||||
"F20F5EE5",
|
"F20F5EE5",
|
||||||
"divsd %xmm5, %xmm4",
|
"divsd %xmm5, %xmm4",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12),
|
Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12, None),
|
||||||
"440F54E3",
|
"440F54E3",
|
||||||
"andps %xmm3, %xmm12",
|
"andps %xmm3, %xmm12",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11),
|
Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11, None),
|
||||||
"440F55DC",
|
"440F55DC",
|
||||||
"andnps %xmm4, %xmm11",
|
"andnps %xmm4, %xmm11",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15),
|
Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15, None),
|
||||||
"440F56F9",
|
"440F56F9",
|
||||||
"orps %xmm1, %xmm15",
|
"orps %xmm1, %xmm15",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4),
|
Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4, None),
|
||||||
"0F56E5",
|
"0F56E5",
|
||||||
"orps %xmm5, %xmm4",
|
"orps %xmm5, %xmm4",
|
||||||
));
|
));
|
||||||
@@ -3081,211 +3083,211 @@ fn test_x64_emit() {
|
|||||||
// XMM_RM_R: Integer Packed
|
// XMM_RM_R: Integer Packed
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5),
|
Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5, None),
|
||||||
"66410FFCE9",
|
"66410FFCE9",
|
||||||
"paddb %xmm9, %xmm5",
|
"paddb %xmm9, %xmm5",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6, None),
|
||||||
"660FFDF7",
|
"660FFDF7",
|
||||||
"paddw %xmm7, %xmm6",
|
"paddw %xmm7, %xmm6",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13),
|
Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13, None),
|
||||||
"66450FFEEC",
|
"66450FFEEC",
|
||||||
"paddd %xmm12, %xmm13",
|
"paddd %xmm12, %xmm13",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8),
|
Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8, None),
|
||||||
"66440FD4C1",
|
"66440FD4C1",
|
||||||
"paddq %xmm1, %xmm8",
|
"paddq %xmm1, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5),
|
Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5, None),
|
||||||
"66410FECE9",
|
"66410FECE9",
|
||||||
"paddsb %xmm9, %xmm5",
|
"paddsb %xmm9, %xmm5",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6, None),
|
||||||
"660FEDF7",
|
"660FEDF7",
|
||||||
"paddsw %xmm7, %xmm6",
|
"paddsw %xmm7, %xmm6",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13),
|
Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13, None),
|
||||||
"66450FDCEC",
|
"66450FDCEC",
|
||||||
"paddusb %xmm12, %xmm13",
|
"paddusb %xmm12, %xmm13",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8),
|
Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8, None),
|
||||||
"66440FDDC1",
|
"66440FDDC1",
|
||||||
"paddusw %xmm1, %xmm8",
|
"paddusw %xmm1, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5),
|
Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5, None),
|
||||||
"66410FE8E9",
|
"66410FE8E9",
|
||||||
"psubsb %xmm9, %xmm5",
|
"psubsb %xmm9, %xmm5",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6, None),
|
||||||
"660FE9F7",
|
"660FE9F7",
|
||||||
"psubsw %xmm7, %xmm6",
|
"psubsw %xmm7, %xmm6",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13),
|
Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13, None),
|
||||||
"66450FD8EC",
|
"66450FD8EC",
|
||||||
"psubusb %xmm12, %xmm13",
|
"psubusb %xmm12, %xmm13",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8),
|
Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8, None),
|
||||||
"66440FD9C1",
|
"66440FD9C1",
|
||||||
"psubusw %xmm1, %xmm8",
|
"psubusw %xmm1, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13),
|
Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13, None),
|
||||||
"66450FE0EC",
|
"66450FE0EC",
|
||||||
"pavgb %xmm12, %xmm13",
|
"pavgb %xmm12, %xmm13",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8),
|
Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8, None),
|
||||||
"66440FE3C1",
|
"66440FE3C1",
|
||||||
"pavgw %xmm1, %xmm8",
|
"pavgw %xmm1, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9),
|
Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9, None),
|
||||||
"66440FF8CD",
|
"66440FF8CD",
|
||||||
"psubb %xmm5, %xmm9",
|
"psubb %xmm5, %xmm9",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7),
|
Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7, None),
|
||||||
"660FF9FE",
|
"660FF9FE",
|
||||||
"psubw %xmm6, %xmm7",
|
"psubw %xmm6, %xmm7",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12),
|
Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12, None),
|
||||||
"66450FFAE5",
|
"66450FFAE5",
|
||||||
"psubd %xmm13, %xmm12",
|
"psubd %xmm13, %xmm12",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1),
|
Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1, None),
|
||||||
"66410FFBC8",
|
"66410FFBC8",
|
||||||
"psubq %xmm8, %xmm1",
|
"psubq %xmm8, %xmm1",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6, None),
|
||||||
"66410F3840F7",
|
"66410F3840F7",
|
||||||
"pmulld %xmm15, %xmm6",
|
"pmulld %xmm15, %xmm6",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1),
|
Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1, None),
|
||||||
"66410FD5CE",
|
"66410FD5CE",
|
||||||
"pmullw %xmm14, %xmm1",
|
"pmullw %xmm14, %xmm1",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9),
|
Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9, None),
|
||||||
"66450FF4C8",
|
"66450FF4C8",
|
||||||
"pmuludq %xmm8, %xmm9",
|
"pmuludq %xmm8, %xmm9",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6, None),
|
||||||
"66410F383CF7",
|
"66410F383CF7",
|
||||||
"pmaxsb %xmm15, %xmm6",
|
"pmaxsb %xmm15, %xmm6",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6, None),
|
||||||
"66410FEEF7",
|
"66410FEEF7",
|
||||||
"pmaxsw %xmm15, %xmm6",
|
"pmaxsw %xmm15, %xmm6",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6),
|
Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6, None),
|
||||||
"66410F383DF7",
|
"66410F383DF7",
|
||||||
"pmaxsd %xmm15, %xmm6",
|
"pmaxsd %xmm15, %xmm6",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1),
|
Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1, None),
|
||||||
"66410FDECE",
|
"66410FDECE",
|
||||||
"pmaxub %xmm14, %xmm1",
|
"pmaxub %xmm14, %xmm1",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1),
|
Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1, None),
|
||||||
"66410F383ECE",
|
"66410F383ECE",
|
||||||
"pmaxuw %xmm14, %xmm1",
|
"pmaxuw %xmm14, %xmm1",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1),
|
Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1, None),
|
||||||
"66410F383FCE",
|
"66410F383FCE",
|
||||||
"pmaxud %xmm14, %xmm1",
|
"pmaxud %xmm14, %xmm1",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9),
|
Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9, None),
|
||||||
"66450F3838C8",
|
"66450F3838C8",
|
||||||
"pminsb %xmm8, %xmm9",
|
"pminsb %xmm8, %xmm9",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9),
|
Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9, None),
|
||||||
"66450FEAC8",
|
"66450FEAC8",
|
||||||
"pminsw %xmm8, %xmm9",
|
"pminsw %xmm8, %xmm9",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9),
|
Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9, None),
|
||||||
"66450F3839C8",
|
"66450F3839C8",
|
||||||
"pminsd %xmm8, %xmm9",
|
"pminsd %xmm8, %xmm9",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2),
|
Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2, None),
|
||||||
"660FDAD3",
|
"660FDAD3",
|
||||||
"pminub %xmm3, %xmm2",
|
"pminub %xmm3, %xmm2",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2),
|
Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2, None),
|
||||||
"660F383AD3",
|
"660F383AD3",
|
||||||
"pminuw %xmm3, %xmm2",
|
"pminuw %xmm3, %xmm2",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2),
|
Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2, None),
|
||||||
"660F383BD3",
|
"660F383BD3",
|
||||||
"pminud %xmm3, %xmm2",
|
"pminud %xmm3, %xmm2",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2),
|
Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2, None),
|
||||||
"66410FEFD3",
|
"66410FEFD3",
|
||||||
"pxor %xmm11, %xmm2",
|
"pxor %xmm11, %xmm2",
|
||||||
));
|
));
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2),
|
Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2, None),
|
||||||
"66410F3800D3",
|
"66410F3800D3",
|
||||||
"pshufb %xmm11, %xmm2",
|
"pshufb %xmm11, %xmm2",
|
||||||
));
|
));
|
||||||
@@ -3496,12 +3498,12 @@ fn test_x64_emit() {
|
|||||||
// ========================================================
|
// ========================================================
|
||||||
// XmmRmRImm
|
// XmmRmRImm
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false),
|
Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false, None),
|
||||||
"660FC2CD02",
|
"660FC2CD02",
|
||||||
"cmppd $2, %xmm5, %xmm1",
|
"cmppd $2, %xmm5, %xmm1",
|
||||||
));
|
));
|
||||||
insns.push((
|
insns.push((
|
||||||
Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false),
|
Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false, None),
|
||||||
"410FC2FF00",
|
"410FC2FF00",
|
||||||
"cmpps $0, %xmm15, %xmm7",
|
"cmpps $0, %xmm15, %xmm7",
|
||||||
));
|
));
|
||||||
|
|||||||
@@ -213,6 +213,7 @@ pub enum Inst {
|
|||||||
op: SseOpcode,
|
op: SseOpcode,
|
||||||
src: RegMem,
|
src: RegMem,
|
||||||
dst: Writable<Reg>,
|
dst: Writable<Reg>,
|
||||||
|
srcloc: Option<SourceLoc>,
|
||||||
},
|
},
|
||||||
|
|
||||||
/// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt,
|
/// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt,
|
||||||
@@ -339,6 +340,7 @@ pub enum Inst {
|
|||||||
dst: Writable<Reg>,
|
dst: Writable<Reg>,
|
||||||
imm: u8,
|
imm: u8,
|
||||||
is64: bool,
|
is64: bool,
|
||||||
|
srcloc: Option<SourceLoc>,
|
||||||
},
|
},
|
||||||
|
|
||||||
// =====================================
|
// =====================================
|
||||||
@@ -712,10 +714,20 @@ impl Inst {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
|
pub(crate) fn xmm_rm_r(
|
||||||
|
op: SseOpcode,
|
||||||
|
src: RegMem,
|
||||||
|
dst: Writable<Reg>,
|
||||||
|
srcloc: Option<SourceLoc>,
|
||||||
|
) -> Self {
|
||||||
src.assert_regclass_is(RegClass::V128);
|
src.assert_regclass_is(RegClass::V128);
|
||||||
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
debug_assert!(dst.to_reg().get_class() == RegClass::V128);
|
||||||
Inst::XmmRmR { op, src, dst }
|
Inst::XmmRmR {
|
||||||
|
op,
|
||||||
|
src,
|
||||||
|
dst,
|
||||||
|
srcloc,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
|
pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
|
||||||
@@ -870,6 +882,7 @@ impl Inst {
|
|||||||
dst: Writable<Reg>,
|
dst: Writable<Reg>,
|
||||||
imm: u8,
|
imm: u8,
|
||||||
is64: bool,
|
is64: bool,
|
||||||
|
srcloc: Option<SourceLoc>,
|
||||||
) -> Inst {
|
) -> Inst {
|
||||||
Inst::XmmRmRImm {
|
Inst::XmmRmRImm {
|
||||||
op,
|
op,
|
||||||
@@ -877,6 +890,7 @@ impl Inst {
|
|||||||
dst,
|
dst,
|
||||||
imm,
|
imm,
|
||||||
is64,
|
is64,
|
||||||
|
srcloc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1234,16 +1248,26 @@ impl Inst {
|
|||||||
/// Choose which instruction to use for comparing two values for equality.
|
/// Choose which instruction to use for comparing two values for equality.
|
||||||
pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
||||||
match ty {
|
match ty {
|
||||||
types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to),
|
types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to, None),
|
||||||
types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to),
|
types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to, None),
|
||||||
types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to),
|
types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to, None),
|
||||||
types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to),
|
types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to, None),
|
||||||
types::F32X4 => {
|
types::F32X4 => Inst::xmm_rm_r_imm(
|
||||||
Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false)
|
SseOpcode::Cmpps,
|
||||||
}
|
from,
|
||||||
types::F64X2 => {
|
to,
|
||||||
Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false)
|
FcmpImm::Equal.encode(),
|
||||||
}
|
false,
|
||||||
|
None,
|
||||||
|
),
|
||||||
|
types::F64X2 => Inst::xmm_rm_r_imm(
|
||||||
|
SseOpcode::Cmppd,
|
||||||
|
from,
|
||||||
|
to,
|
||||||
|
FcmpImm::Equal.encode(),
|
||||||
|
false,
|
||||||
|
None,
|
||||||
|
),
|
||||||
_ => unimplemented!("unimplemented type for Inst::equals: {}", ty),
|
_ => unimplemented!("unimplemented type for Inst::equals: {}", ty),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1251,9 +1275,11 @@ impl Inst {
|
|||||||
/// Choose which instruction to use for computing a bitwise AND on two values.
|
/// Choose which instruction to use for computing a bitwise AND on two values.
|
||||||
pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
||||||
match ty {
|
match ty {
|
||||||
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to),
|
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to, None),
|
||||||
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to),
|
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to, None),
|
||||||
_ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to),
|
_ if ty.is_vector() && ty.bits() == 128 => {
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pand, from, to, None)
|
||||||
|
}
|
||||||
_ => unimplemented!("unimplemented type for Inst::and: {}", ty),
|
_ => unimplemented!("unimplemented type for Inst::and: {}", ty),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1261,9 +1287,11 @@ impl Inst {
|
|||||||
/// Choose which instruction to use for computing a bitwise AND NOT on two values.
|
/// Choose which instruction to use for computing a bitwise AND NOT on two values.
|
||||||
pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
||||||
match ty {
|
match ty {
|
||||||
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to),
|
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to, None),
|
||||||
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to),
|
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to, None),
|
||||||
_ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to),
|
_ if ty.is_vector() && ty.bits() == 128 => {
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pandn, from, to, None)
|
||||||
|
}
|
||||||
_ => unimplemented!("unimplemented type for Inst::and_not: {}", ty),
|
_ => unimplemented!("unimplemented type for Inst::and_not: {}", ty),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1271,9 +1299,11 @@ impl Inst {
|
|||||||
/// Choose which instruction to use for computing a bitwise OR on two values.
|
/// Choose which instruction to use for computing a bitwise OR on two values.
|
||||||
pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
||||||
match ty {
|
match ty {
|
||||||
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to),
|
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to, None),
|
||||||
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to),
|
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to, None),
|
||||||
_ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to),
|
_ if ty.is_vector() && ty.bits() == 128 => {
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Por, from, to, None)
|
||||||
|
}
|
||||||
_ => unimplemented!("unimplemented type for Inst::or: {}", ty),
|
_ => unimplemented!("unimplemented type for Inst::or: {}", ty),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1281,9 +1311,11 @@ impl Inst {
|
|||||||
/// Choose which instruction to use for computing a bitwise XOR on two values.
|
/// Choose which instruction to use for computing a bitwise XOR on two values.
|
||||||
pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
|
||||||
match ty {
|
match ty {
|
||||||
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to),
|
types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to, None),
|
||||||
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to),
|
types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to, None),
|
||||||
_ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to),
|
_ if ty.is_vector() && ty.bits() == 128 => {
|
||||||
|
Inst::xmm_rm_r(SseOpcode::Pxor, from, to, None)
|
||||||
|
}
|
||||||
_ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
|
_ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1430,7 +1462,7 @@ impl PrettyPrint for Inst {
|
|||||||
dst.show_rru(mb_rru),
|
dst.show_rru(mb_rru),
|
||||||
),
|
),
|
||||||
|
|
||||||
Inst::XmmRmR { op, src, dst } => format!(
|
Inst::XmmRmR { op, src, dst, .. } => format!(
|
||||||
"{} {}, {}",
|
"{} {}, {}",
|
||||||
ljustify(op.to_string()),
|
ljustify(op.to_string()),
|
||||||
src.show_rru_sized(mb_rru, 8),
|
src.show_rru_sized(mb_rru, 8),
|
||||||
@@ -1460,7 +1492,7 @@ impl PrettyPrint for Inst {
|
|||||||
show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8),
|
show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8),
|
||||||
),
|
),
|
||||||
|
|
||||||
Inst::XmmRmRImm { op, src, dst, imm, is64 } => format!(
|
Inst::XmmRmRImm { op, src, dst, imm, is64, .. } => format!(
|
||||||
"{} ${}, {}, {}",
|
"{} ${}, {}, {}",
|
||||||
ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })),
|
ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })),
|
||||||
imm,
|
imm,
|
||||||
@@ -2596,6 +2628,7 @@ impl MachInst for Inst {
|
|||||||
SseOpcode::Xorps,
|
SseOpcode::Xorps,
|
||||||
RegMem::reg(to_reg.to_reg()),
|
RegMem::reg(to_reg.to_reg()),
|
||||||
to_reg,
|
to_reg,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
} else {
|
} else {
|
||||||
let tmp = alloc_tmp(RegClass::I64, types::I32);
|
let tmp = alloc_tmp(RegClass::I64, types::I32);
|
||||||
@@ -2614,6 +2647,7 @@ impl MachInst for Inst {
|
|||||||
SseOpcode::Xorpd,
|
SseOpcode::Xorpd,
|
||||||
RegMem::reg(to_reg.to_reg()),
|
RegMem::reg(to_reg.to_reg()),
|
||||||
to_reg,
|
to_reg,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
} else {
|
} else {
|
||||||
let tmp = alloc_tmp(RegClass::I64, types::I64);
|
let tmp = alloc_tmp(RegClass::I64, types::I64);
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
use crate::data_value::DataValue;
|
use crate::data_value::DataValue;
|
||||||
use crate::ir::{
|
use crate::ir::{
|
||||||
condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
|
condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
|
||||||
Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
|
Inst as IRInst, InstructionData, LibCall, Opcode, Signature, SourceLoc, Type,
|
||||||
};
|
};
|
||||||
use crate::isa::x64::abi::*;
|
use crate::isa::x64::abi::*;
|
||||||
use crate::isa::x64::inst::args::*;
|
use crate::isa::x64::inst::args::*;
|
||||||
@@ -227,6 +227,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
|
|||||||
dst: Writable<Reg>,
|
dst: Writable<Reg>,
|
||||||
lane: u8,
|
lane: u8,
|
||||||
ty: Type,
|
ty: Type,
|
||||||
|
srcloc: Option<SourceLoc>,
|
||||||
) {
|
) {
|
||||||
if !ty.is_float() {
|
if !ty.is_float() {
|
||||||
let (sse_op, is64) = match ty.lane_bits() {
|
let (sse_op, is64) = match ty.lane_bits() {
|
||||||
@@ -236,13 +237,13 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
|
|||||||
64 => (SseOpcode::Pinsrd, true),
|
64 => (SseOpcode::Pinsrd, true),
|
||||||
_ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
|
_ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
|
||||||
};
|
};
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
|
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64, srcloc));
|
||||||
} else if ty == types::F32 {
|
} else if ty == types::F32 {
|
||||||
let sse_op = SseOpcode::Insertps;
|
let sse_op = SseOpcode::Insertps;
|
||||||
// Insert 32-bits from replacement (at index 00, bits 7:6) to vector (lane
|
// Insert 32-bits from replacement (at index 00, bits 7:6) to vector (lane
|
||||||
// shifted into bits 5:4).
|
// shifted into bits 5:4).
|
||||||
let lane = 0b00_00_00_00 | lane << 4;
|
let lane = 0b00_00_00_00 | lane << 4;
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
|
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false, srcloc));
|
||||||
} else if ty == types::F64 {
|
} else if ty == types::F64 {
|
||||||
let sse_op = match lane {
|
let sse_op = match lane {
|
||||||
// Move the lowest quadword in replacement to vector without changing
|
// Move the lowest quadword in replacement to vector without changing
|
||||||
@@ -256,7 +257,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
|
|||||||
// Here we use the `xmm_rm_r` encoding because it correctly tells the register
|
// Here we use the `xmm_rm_r` encoding because it correctly tells the register
|
||||||
// allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
|
// allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
|
||||||
// encoding formats like `xmm_unary_rm_r` treat it as a `def`.
|
// encoding formats like `xmm_unary_rm_r` treat it as a `def`.
|
||||||
ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
|
ctx.emit(Inst::xmm_rm_r(sse_op, src, dst, srcloc));
|
||||||
} else {
|
} else {
|
||||||
panic!("unable to emit insertlane for type: {}", ty)
|
panic!("unable to emit insertlane for type: {}", ty)
|
||||||
}
|
}
|
||||||
@@ -694,6 +695,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
SseOpcode::Pmuludq,
|
SseOpcode::Pmuludq,
|
||||||
RegMem::reg(lhs.clone()),
|
RegMem::reg(lhs.clone()),
|
||||||
rhs_1,
|
rhs_1,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
|
|
||||||
// B' = B
|
// B' = B
|
||||||
@@ -707,7 +709,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
RegMemImm::imm(32),
|
RegMemImm::imm(32),
|
||||||
lhs_1,
|
lhs_1,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pmuludq,
|
||||||
|
RegMem::reg(rhs),
|
||||||
|
lhs_1,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// B' = B' + A'
|
// B' = B' + A'
|
||||||
// B' = B' << 32
|
// B' = B' << 32
|
||||||
@@ -715,6 +722,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
SseOpcode::Paddq,
|
SseOpcode::Paddq,
|
||||||
RegMem::reg(rhs_1.to_reg()),
|
RegMem::reg(rhs_1.to_reg()),
|
||||||
lhs_1,
|
lhs_1,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rmi_reg(
|
ctx.emit(Inst::xmm_rmi_reg(
|
||||||
SseOpcode::Psllq,
|
SseOpcode::Psllq,
|
||||||
@@ -731,11 +739,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
SseOpcode::Pmuludq,
|
SseOpcode::Pmuludq,
|
||||||
RegMem::reg(lhs.clone()),
|
RegMem::reg(lhs.clone()),
|
||||||
rhs_1,
|
rhs_1,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
SseOpcode::Paddq,
|
SseOpcode::Paddq,
|
||||||
RegMem::reg(lhs_1.to_reg()),
|
RegMem::reg(lhs_1.to_reg()),
|
||||||
rhs_1,
|
rhs_1,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
|
ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@@ -770,7 +780,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
|
|
||||||
// Move the `lhs` to the same register as `dst`.
|
// Move the `lhs` to the same register as `dst`.
|
||||||
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
||||||
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
|
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
|
||||||
} else {
|
} else {
|
||||||
let is_64 = ty == types::I64;
|
let is_64 = ty == types::I64;
|
||||||
let alu_op = match op {
|
let alu_op = match op {
|
||||||
@@ -828,7 +838,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// Note the flipping of operands: the `rhs` operand is used as the destination instead
|
// Note the flipping of operands: the `rhs` operand is used as the destination instead
|
||||||
// of the `lhs` as in the other bit operations above (e.g. `band`).
|
// of the `lhs` as in the other bit operations above (e.g. `band`).
|
||||||
ctx.emit(Inst::gen_move(dst, rhs, ty));
|
ctx.emit(Inst::gen_move(dst, rhs, ty));
|
||||||
ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
|
ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst, None));
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Iabs => {
|
Opcode::Iabs => {
|
||||||
@@ -884,7 +894,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
|
|
||||||
// Move the `lhs` to the same register as `dst`.
|
// Move the `lhs` to the same register as `dst`.
|
||||||
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
||||||
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
|
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
|
||||||
} else {
|
} else {
|
||||||
panic!("Unsupported type for {} instruction: {}", op, ty);
|
panic!("Unsupported type for {} instruction: {}", op, ty);
|
||||||
}
|
}
|
||||||
@@ -1007,8 +1017,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
SseOpcode::Pxor,
|
SseOpcode::Pxor,
|
||||||
RegMem::reg(tmp.to_reg()),
|
RegMem::reg(tmp.to_reg()),
|
||||||
tmp,
|
tmp,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
|
ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp, None));
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(
|
ctx.emit(Inst::xmm_unary_rm_r(
|
||||||
SseOpcode::Movapd,
|
SseOpcode::Movapd,
|
||||||
RegMem::reg(tmp.to_reg()),
|
RegMem::reg(tmp.to_reg()),
|
||||||
@@ -1561,34 +1572,44 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
};
|
};
|
||||||
|
|
||||||
match condcode {
|
match condcode {
|
||||||
IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
|
IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)),
|
||||||
IntCC::NotEqual => {
|
IntCC::NotEqual => {
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
|
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
|
||||||
// Emit all 1s into the `tmp` register.
|
// Emit all 1s into the `tmp` register.
|
||||||
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
|
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
|
||||||
// Invert the result of the `PCMPEQ*`.
|
// Invert the result of the `PCMPEQ*`.
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pxor,
|
||||||
|
RegMem::from(tmp),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
}
|
}
|
||||||
IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
|
IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
|
||||||
ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
|
ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst, None))
|
||||||
}
|
}
|
||||||
IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
|
IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
|
||||||
ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
|
ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst, None));
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
|
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
|
||||||
}
|
}
|
||||||
IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
|
IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
|
||||||
ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
|
ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst, None));
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
|
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
|
||||||
// Emit all 1s into the `tmp` register.
|
// Emit all 1s into the `tmp` register.
|
||||||
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
|
ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
|
||||||
// Invert the result of the `PCMPEQ*`.
|
// Invert the result of the `PCMPEQ*`.
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pxor,
|
||||||
|
RegMem::from(tmp),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
}
|
}
|
||||||
IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
|
IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
|
||||||
ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
|
ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst, None));
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
|
ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
|
||||||
}
|
}
|
||||||
_ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
|
_ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
|
||||||
}
|
}
|
||||||
@@ -1686,7 +1707,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(Inst::gen_move(dst, lhs, input_ty));
|
ctx.emit(Inst::gen_move(dst, lhs, input_ty));
|
||||||
|
|
||||||
// Emit the comparison.
|
// Emit the comparison.
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false));
|
ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false, None));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1899,7 +1920,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ty
|
ty
|
||||||
),
|
),
|
||||||
};
|
};
|
||||||
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
|
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Fmin | Opcode::Fmax => {
|
Opcode::Fmin | Opcode::Fmax => {
|
||||||
@@ -1988,15 +2009,15 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
|
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
|
||||||
|
|
||||||
// Perform min in reverse direction
|
// Perform min in reverse direction
|
||||||
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
|
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1, None));
|
||||||
|
|
||||||
// Perform min in original direction
|
// Perform min in original direction
|
||||||
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
|
ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst, None));
|
||||||
|
|
||||||
// X64 handles propagation of -0's and Nans differently between left and right
|
// X64 handles propagation of -0's and Nans differently between left and right
|
||||||
// operands. After doing the min in both directions, this OR will
|
// operands. After doing the min in both directions, this OR will
|
||||||
// guarrentee capture of -0's and Nan in our tmp register
|
// guarrentee capture of -0's and Nan in our tmp register
|
||||||
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
|
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1, None));
|
||||||
|
|
||||||
// Compare unordered to create mask for lanes containing NaNs and then use
|
// Compare unordered to create mask for lanes containing NaNs and then use
|
||||||
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
|
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
|
||||||
@@ -2009,8 +2030,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
dst,
|
dst,
|
||||||
cond.encode(),
|
cond.encode(),
|
||||||
false,
|
false,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
or_op,
|
||||||
|
RegMem::reg(dst.to_reg()),
|
||||||
|
tmp_xmm1,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
|
|
||||||
|
|
||||||
// The dst register holds a mask for lanes containing NaNs.
|
// The dst register holds a mask for lanes containing NaNs.
|
||||||
// We take that mask and shift in preparation for creating a different mask
|
// We take that mask and shift in preparation for creating a different mask
|
||||||
@@ -2022,7 +2049,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
|
|
||||||
// Finally we do a nand with the tmp register to produce the final results
|
// Finally we do a nand with the tmp register to produce the final results
|
||||||
// in the dst.
|
// in the dst.
|
||||||
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
andn_op,
|
||||||
|
RegMem::reg(tmp_xmm1.to_reg()),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
} else {
|
} else {
|
||||||
let (
|
let (
|
||||||
mov_op,
|
mov_op,
|
||||||
@@ -2065,23 +2097,43 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
|
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
|
||||||
|
|
||||||
// Perform max in reverse direction.
|
// Perform max in reverse direction.
|
||||||
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
max_op,
|
||||||
|
RegMem::reg(dst.to_reg()),
|
||||||
|
tmp_xmm1,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// Perform max in original direction.
|
// Perform max in original direction.
|
||||||
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
|
ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst, None));
|
||||||
|
|
||||||
// Get the difference between the two results and store in tmp.
|
// Get the difference between the two results and store in tmp.
|
||||||
// Max uses a different approach than min to account for potential
|
// Max uses a different approach than min to account for potential
|
||||||
// discrepancies with plus/minus 0.
|
// discrepancies with plus/minus 0.
|
||||||
ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
xor_op,
|
||||||
|
RegMem::reg(tmp_xmm1.to_reg()),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// X64 handles propagation of -0's and Nans differently between left and right
|
// X64 handles propagation of -0's and Nans differently between left and right
|
||||||
// operands. After doing the max in both directions, this OR will
|
// operands. After doing the max in both directions, this OR will
|
||||||
// guarentee capture of 0's and Nan in our tmp register.
|
// guarentee capture of 0's and Nan in our tmp register.
|
||||||
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
or_op,
|
||||||
|
RegMem::reg(dst.to_reg()),
|
||||||
|
tmp_xmm1,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// Capture NaNs and sign discrepancies.
|
// Capture NaNs and sign discrepancies.
|
||||||
ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
sub_op,
|
||||||
|
RegMem::reg(dst.to_reg()),
|
||||||
|
tmp_xmm1,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// Compare unordered to create mask for lanes containing NaNs and then use
|
// Compare unordered to create mask for lanes containing NaNs and then use
|
||||||
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
|
// that mask to saturate the NaN containing lanes in the tmp register with 1s.
|
||||||
@@ -2092,6 +2144,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
dst,
|
dst,
|
||||||
cond.encode(),
|
cond.encode(),
|
||||||
false,
|
false,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
|
|
||||||
// The dst register holds a mask for lanes containing NaNs.
|
// The dst register holds a mask for lanes containing NaNs.
|
||||||
@@ -2104,7 +2157,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
|
|
||||||
// Finally we do a nand with the tmp register to produce the final results
|
// Finally we do a nand with the tmp register to produce the final results
|
||||||
// in the dst.
|
// in the dst.
|
||||||
ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
andn_op,
|
||||||
|
RegMem::reg(tmp_xmm1.to_reg()),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2340,7 +2398,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(inst);
|
ctx.emit(inst);
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
|
ctx.emit(Inst::xmm_rm_r(opcode, src, dst, None));
|
||||||
} else {
|
} else {
|
||||||
// Eventually vector constants should be available in `gen_constant` and this block
|
// Eventually vector constants should be available in `gen_constant` and this block
|
||||||
// can be merged with the one above (TODO).
|
// can be merged with the one above (TODO).
|
||||||
@@ -2361,6 +2419,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
tmp,
|
tmp,
|
||||||
cond.encode(),
|
cond.encode(),
|
||||||
false,
|
false,
|
||||||
|
None,
|
||||||
);
|
);
|
||||||
ctx.emit(cmpps);
|
ctx.emit(cmpps);
|
||||||
|
|
||||||
@@ -2380,7 +2439,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(shift);
|
ctx.emit(shift);
|
||||||
|
|
||||||
// Apply shifted mask (XOR or AND).
|
// Apply shifted mask (XOR or AND).
|
||||||
let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
|
let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst, None);
|
||||||
ctx.emit(mask);
|
ctx.emit(mask);
|
||||||
} else {
|
} else {
|
||||||
panic!("unexpected type {:?} for Fabs", output_ty);
|
panic!("unexpected type {:?} for Fabs", output_ty);
|
||||||
@@ -2439,14 +2498,20 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
dst,
|
dst,
|
||||||
None,
|
None,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
|
ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst, None));
|
||||||
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None));
|
ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None));
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
and_op,
|
and_op,
|
||||||
RegMem::reg(tmp_xmm1.to_reg()),
|
RegMem::reg(tmp_xmm1.to_reg()),
|
||||||
tmp_xmm2,
|
tmp_xmm2,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
or_op,
|
||||||
|
RegMem::reg(tmp_xmm2.to_reg()),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
|
Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
|
||||||
@@ -3167,7 +3232,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
// After loading the constructed mask in a temporary register, we use this to
|
// After loading the constructed mask in a temporary register, we use this to
|
||||||
// shuffle the `dst` register (remember that, in this case, it is the same as
|
// shuffle the `dst` register (remember that, in this case, it is the same as
|
||||||
// `src` so we disregard this register).
|
// `src` so we disregard this register).
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pshufb,
|
||||||
|
RegMem::from(tmp),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
} else {
|
} else {
|
||||||
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
// If `lhs` and `rhs` are different, we must shuffle each separately and then OR
|
||||||
// them together. This is necessary due to PSHUFB semantics. As in the case above,
|
// them together. This is necessary due to PSHUFB semantics. As in the case above,
|
||||||
@@ -3179,7 +3249,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
|
||||||
let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
|
let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
|
||||||
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
|
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pshufb,
|
||||||
|
RegMem::from(tmp1),
|
||||||
|
tmp0,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// PSHUFB the second argument, placing zeroes for unused lanes.
|
// PSHUFB the second argument, placing zeroes for unused lanes.
|
||||||
let constructed_mask = mask
|
let constructed_mask = mask
|
||||||
@@ -3189,11 +3264,21 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
.collect();
|
.collect();
|
||||||
let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
|
let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
|
||||||
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
|
ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pshufb,
|
||||||
|
RegMem::from(tmp2),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
|
// OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
|
||||||
// is not important).
|
// is not important).
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Orps,
|
||||||
|
RegMem::from(tmp0),
|
||||||
|
dst,
|
||||||
|
None,
|
||||||
|
));
|
||||||
|
|
||||||
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
|
// TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
|
||||||
}
|
}
|
||||||
@@ -3227,6 +3312,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
SseOpcode::Paddusb,
|
SseOpcode::Paddusb,
|
||||||
RegMem::from(zero_mask),
|
RegMem::from(zero_mask),
|
||||||
swizzle_mask,
|
swizzle_mask,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
|
|
||||||
// Shuffle `dst` using the fixed-up `swizzle_mask`.
|
// Shuffle `dst` using the fixed-up `swizzle_mask`.
|
||||||
@@ -3234,6 +3320,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
SseOpcode::Pshufb,
|
SseOpcode::Pshufb,
|
||||||
RegMem::from(swizzle_mask),
|
RegMem::from(swizzle_mask),
|
||||||
dst,
|
dst,
|
||||||
|
None,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3253,7 +3340,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
debug_assert!(lane < ty.lane_count() as u8);
|
debug_assert!(lane < ty.lane_count() as u8);
|
||||||
|
|
||||||
ctx.emit(Inst::gen_move(dst, in_vec, ty));
|
ctx.emit(Inst::gen_move(dst, in_vec, ty));
|
||||||
emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
|
emit_insert_lane(ctx, src, dst, lane, ty.lane_type(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Extractlane => {
|
Opcode::Extractlane => {
|
||||||
@@ -3279,7 +3366,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
_ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
|
_ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
|
||||||
};
|
};
|
||||||
let src = RegMem::reg(src);
|
let src = RegMem::reg(src);
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
|
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit, None));
|
||||||
} else {
|
} else {
|
||||||
if lane == 0 {
|
if lane == 0 {
|
||||||
// Remove the extractlane instruction, leaving the float where it is. The upper
|
// Remove the extractlane instruction, leaving the float where it is. The upper
|
||||||
@@ -3301,35 +3388,57 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
let src = RegMem::reg(src);
|
let src = RegMem::reg(src);
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
|
ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false, None));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Splat => {
|
Opcode::Splat | Opcode::LoadSplat => {
|
||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
assert_eq!(ty.bits(), 128);
|
assert_eq!(ty.bits(), 128);
|
||||||
let src_ty = ctx.input_ty(insn, 0);
|
let src_ty = ctx.input_ty(insn, 0);
|
||||||
assert!(src_ty.bits() < 128);
|
assert!(src_ty.bits() < 128);
|
||||||
let src = input_to_reg_mem(ctx, inputs[0]);
|
|
||||||
|
let (src, srcloc) = match op {
|
||||||
|
Opcode::Splat => (input_to_reg_mem(ctx, inputs[0]), None),
|
||||||
|
Opcode::LoadSplat => {
|
||||||
|
let offset = ctx.data(insn).load_store_offset().unwrap();
|
||||||
|
let amode = lower_to_amode(ctx, inputs[0], offset);
|
||||||
|
(RegMem::mem(amode), Some(ctx.srcloc(insn)))
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
let dst = get_output_reg(ctx, outputs[0]);
|
let dst = get_output_reg(ctx, outputs[0]);
|
||||||
|
|
||||||
// We know that splat will overwrite all of the lanes of `dst` but it takes several
|
// We know that splat will overwrite all of the lanes of `dst` but it takes several
|
||||||
// instructions to do so. Because of the multiple instructions, there is no good way to
|
// instructions to do so. Because of the multiple instructions, there is no good way to
|
||||||
// declare `dst` a `def` except with the following pseudo-instruction.
|
// declare `dst` a `def` except with the following pseudo-instruction.
|
||||||
ctx.emit(Inst::xmm_uninit_value(dst));
|
ctx.emit(Inst::xmm_uninit_value(dst));
|
||||||
|
|
||||||
|
// TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
|
||||||
|
// and VPBROADCAST*.
|
||||||
match ty.lane_bits() {
|
match ty.lane_bits() {
|
||||||
8 => {
|
8 => {
|
||||||
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
|
emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
|
||||||
// Initialize a register with all 0s.
|
// Initialize a register with all 0s.
|
||||||
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pxor,
|
||||||
|
RegMem::from(tmp),
|
||||||
|
tmp,
|
||||||
|
srcloc,
|
||||||
|
));
|
||||||
// Shuffle the lowest byte lane to all other lanes.
|
// Shuffle the lowest byte lane to all other lanes.
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pshufb,
|
||||||
|
RegMem::from(tmp),
|
||||||
|
dst,
|
||||||
|
srcloc,
|
||||||
|
))
|
||||||
}
|
}
|
||||||
16 => {
|
16 => {
|
||||||
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
|
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
|
||||||
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
|
emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
|
||||||
// Shuffle the lowest two lanes to all other lanes.
|
// Shuffle the lowest two lanes to all other lanes.
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
SseOpcode::Pshufd,
|
SseOpcode::Pshufd,
|
||||||
@@ -3337,10 +3446,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
dst,
|
dst,
|
||||||
0,
|
0,
|
||||||
false,
|
false,
|
||||||
|
srcloc,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
32 => {
|
32 => {
|
||||||
emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
|
emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
|
||||||
// Shuffle the lowest lane to all other lanes.
|
// Shuffle the lowest lane to all other lanes.
|
||||||
ctx.emit(Inst::xmm_rm_r_imm(
|
ctx.emit(Inst::xmm_rm_r_imm(
|
||||||
SseOpcode::Pshufd,
|
SseOpcode::Pshufd,
|
||||||
@@ -3348,11 +3458,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
dst,
|
dst,
|
||||||
0,
|
0,
|
||||||
false,
|
false,
|
||||||
|
srcloc,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
64 => {
|
64 => {
|
||||||
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
|
emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
|
||||||
emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
|
emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
|
||||||
}
|
}
|
||||||
_ => panic!("Invalid type to splat: {}", ty),
|
_ => panic!("Invalid type to splat: {}", ty),
|
||||||
}
|
}
|
||||||
@@ -3386,9 +3497,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
|
|
||||||
// Initialize a register with all 0s.
|
// Initialize a register with all 0s.
|
||||||
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
let tmp = ctx.alloc_tmp(RegClass::V128, ty);
|
||||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Pxor,
|
||||||
|
RegMem::from(tmp),
|
||||||
|
tmp,
|
||||||
|
None,
|
||||||
|
));
|
||||||
// Compare to see what lanes are filled with all 1s.
|
// Compare to see what lanes are filled with all 1s.
|
||||||
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
|
ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp, None));
|
||||||
// Set the ZF if the result is all zeroes.
|
// Set the ZF if the result is all zeroes.
|
||||||
ctx.emit(Inst::xmm_cmp_rm_r(
|
ctx.emit(Inst::xmm_cmp_rm_r(
|
||||||
SseOpcode::Ptest,
|
SseOpcode::Ptest,
|
||||||
|
|||||||
@@ -1892,3 +1892,31 @@ fn expand_tls_value(
|
|||||||
unreachable!();
|
unreachable!();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn expand_load_splat(
|
||||||
|
inst: ir::Inst,
|
||||||
|
func: &mut ir::Function,
|
||||||
|
_cfg: &mut ControlFlowGraph,
|
||||||
|
_isa: &dyn TargetIsa,
|
||||||
|
) {
|
||||||
|
let mut pos = FuncCursor::new(func).at_inst(inst);
|
||||||
|
|
||||||
|
pos.use_srcloc(inst);
|
||||||
|
|
||||||
|
let (ptr, offset, flags) = match pos.func.dfg[inst] {
|
||||||
|
ir::InstructionData::Load {
|
||||||
|
opcode: ir::Opcode::LoadSplat,
|
||||||
|
arg,
|
||||||
|
offset,
|
||||||
|
flags,
|
||||||
|
} => (arg, offset, flags),
|
||||||
|
_ => panic!(
|
||||||
|
"Expected load_splat: {}",
|
||||||
|
pos.func.dfg.display_inst(inst, None)
|
||||||
|
),
|
||||||
|
};
|
||||||
|
let ty = pos.func.dfg.ctrl_typevar(inst);
|
||||||
|
let load = pos.ins().load(ty.lane_type(), flags, ptr, offset);
|
||||||
|
|
||||||
|
pos.func.dfg.replace(inst).splat(ty, load);
|
||||||
|
}
|
||||||
|
|||||||
@@ -1414,19 +1414,17 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
|||||||
| Operator::V128Load16Splat { memarg }
|
| Operator::V128Load16Splat { memarg }
|
||||||
| Operator::V128Load32Splat { memarg }
|
| Operator::V128Load32Splat { memarg }
|
||||||
| Operator::V128Load64Splat { memarg } => {
|
| Operator::V128Load64Splat { memarg } => {
|
||||||
// TODO: For spec compliance, this is initially implemented as a combination of `load +
|
let opcode = ir::Opcode::LoadSplat;
|
||||||
// splat` but could be implemented eventually as a single instruction (`load_splat`).
|
let result_ty = type_of(op);
|
||||||
// See https://github.com/bytecodealliance/wasmtime/issues/1175.
|
let (flags, base, offset) = prepare_load(
|
||||||
translate_load(
|
|
||||||
memarg,
|
memarg,
|
||||||
ir::Opcode::Load,
|
mem_op_size(opcode, result_ty.lane_type()),
|
||||||
type_of(op).lane_type(),
|
|
||||||
builder,
|
builder,
|
||||||
state,
|
state,
|
||||||
environ,
|
environ,
|
||||||
)?;
|
)?;
|
||||||
let splatted = builder.ins().splat(type_of(op), state.pop1());
|
let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
|
||||||
state.push1(splatted)
|
state.push1(dfg.first_result(load))
|
||||||
}
|
}
|
||||||
Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
|
Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
|
||||||
let vector = pop1_with_bitcast(state, type_of(op), builder);
|
let vector = pop1_with_bitcast(state, type_of(op), builder);
|
||||||
@@ -2088,7 +2086,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
|
|||||||
ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
|
ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
|
||||||
ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
|
ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
|
||||||
ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
|
ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
|
||||||
ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
|
ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
|
||||||
_ => panic!("unknown size of mem op for {:?}", opcode),
|
_ => panic!("unknown size of mem op for {:?}", opcode),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user