x64: Add most remaining AVX lowerings (#5819)

* x64: Add most remaining AVX lowerings

This commit goes through `inst.isle` and adds a corresponding AVX
lowering for most SSE lowerings. I opted to skip instructions where the
SSE lowering doesn't both read and modify a register, such as
`roundps`, since those don't suffer from a destructive destination. I
think AVX will still benefit those instructions once load-merging is
supported, since AVX doesn't require alignment for memory operands, but
I've deferred that work to a future PR.

Otherwise, though, this PR should cover all (or almost all) of the
three-operand AVX forms of instructions that have SSE counterparts.
This should improve codegen slightly by reducing register pressure and
removing the need for `movdqa` moves between registers. I've attempted
to ensure that there's at least one codegen test for each of the new
instructions.
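
To illustrate the register-pressure point, here's a minimal sketch (with
stand-in types, not Cranelift's actual `OperandCollector`) of the
constraint difference between the two encodings:

#[derive(Default)]
struct Collector {
    constraints: Vec<String>,
}

impl Collector {
    fn reg_use(&mut self, r: &str) {
        self.constraints.push(format!("use {r}"));
    }
    fn reg_def(&mut self, r: &str) {
        self.constraints.push(format!("def {r}"));
    }
    // Ties the def to an earlier use operand, forcing both into one register.
    fn reg_reuse_def(&mut self, r: &str, operand: usize) {
        self.constraints.push(format!("def {r} (reuses operand {operand})"));
    }
}

fn main() {
    // Destructive SSE `addps`: dst must get the same register as src1, so
    // regalloc inserts a `movdqa` copy whenever src1 is still live afterwards.
    let mut sse = Collector::default();
    sse.reg_use("src1");
    sse.reg_reuse_def("dst", 0);
    sse.reg_use("src2");

    // Three-operand AVX `vaddps`: dst is independent and src1 survives.
    let mut avx = Collector::default();
    avx.reg_use("src1");
    avx.reg_use("src2");
    avx.reg_def("dst");

    println!("SSE: {:?}", sse.constraints);
    println!("AVX: {:?}", avx.constraints);
}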

As a side note, the recent Capstone integration in the `precise-output`
tests helped me catch a number of encoding bugs much earlier than I
otherwise would have, so I've found it incredibly useful in tests!

* Move `vpinsr*` instructions to their own variant

Also use true `XmmMem` and `GprMem` types in the instruction to get
more type-level safety about what goes where.
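
Roughly, the idea is captured by this simplified sketch (stand-in types,
not the real definitions): an `XmmMem` operand can only hold an XMM
register or a memory address, and a `GprMem` only a general-purpose
register or memory, so the instruction can't be built with its operands
swapped:

struct Xmm(u8);
struct Gpr(u8);

enum XmmMem {
    Reg(Xmm),
    Mem(u32), // address placeholder
}

enum GprMem {
    Reg(Gpr),
    Mem(u32),
}

// The dedicated variant states exactly what goes where: an XMM destination
// and first source, with the inserted lane coming from a GPR or memory.
struct XmmVexPinsr {
    dst: Xmm,
    src1: Xmm,
    src2: GprMem,
    imm: u8, // lane index
}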

* Remove `Inst::produces_const` accessor

Instead of conditionally defining regalloc and various other
operations, add dedicated `MInst` variants for operations that are
intended to produce a constant, giving them clearer interactions with
regalloc, printing, and the like.
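
The shape of the change, as a hedged sketch (simplified, not the real
`MInst`): rather than one variant whose regalloc behavior silently
changes when `src1 == src2`, a dedicated variant states up front that
the destination is defined independently of any input:

enum Inst {
    // General op: uses src1/src2, and dst must reuse src1's register.
    XmmRmR { op: &'static str, src1: u8, src2: u8, dst: u8 },
    // Constant-producing idiom (e.g. `pcmpeqd x, x` yields all ones and
    // `pxor x, x` yields zero): only *defines* dst, so regalloc never
    // sees a phantom use of an undefined register.
    XmmConstOp { op: &'static str, dst: u8 },
}

fn defs_and_uses(inst: &Inst) -> (Vec<u8>, Vec<u8>) {
    match inst {
        Inst::XmmRmR { src1, src2, dst, .. } => (vec![*dst], vec![*src1, *src2]),
        Inst::XmmConstOp { dst, .. } => (vec![*dst], vec![]),
    }
}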

* Fix tests

* Register traps in `MachBuffer` for load-folding ops

This adds a missing `add_trap` call to the encoding of VEX instructions
with memory operands, ensuring that if one causes a segfault there's
appropriate metadata for Wasmtime to understand that the instruction
could in fact trap. This fixes a fuzz test case found locally where v8
trapped but Wasmtime didn't catch the signal and crashed the fuzzer.
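
For context, a hedged sketch of how that metadata is consumed (stand-in
types; the real machinery lives in `MachBuffer` and Wasmtime's signal
handling): `add_trap` records a `(code_offset, trap_code)` pair, and the
trap handler later maps a faulting PC back to a trap code:

// Trap table sorted by code offset, as the machine buffer emits it.
fn trap_code_for_offset(table: &[(u32, &'static str)], fault: u32) -> Option<&'static str> {
    table
        .binary_search_by_key(&fault, |&(off, _)| off)
        .ok()
        .map(|idx| table[idx].1)
}

fn main() {
    let table = [(0x10, "heap_oob"), (0x24, "heap_oob")];
    // A guard-page fault on a load-folded VEX instruction at offset 0x24:
    // without the add_trap entry this lookup fails and the signal escapes
    // as a plain crash, which is exactly the fuzz failure described above.
    assert_eq!(trap_code_for_offset(&table, 0x24), Some("heap_oob"));
    assert_eq!(trap_code_for_offset(&table, 0x30), None);
}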
Alex Crichton
2023-02-20 09:11:52 -06:00
committed by GitHub
parent ad128b6811
commit c26a65a854
16 changed files with 4145 additions and 466 deletions


@@ -4,6 +4,7 @@
use super::evex::Register;
use super::rex::{LegacyPrefixes, OpcodeMap};
use super::ByteSink;
use crate::ir::TrapCode;
use crate::isa::x64::args::Amode;
use crate::isa::x64::encoding::rex;
use crate::isa::x64::inst::Inst;
@@ -267,6 +268,12 @@ impl VexInstruction {
/// Emit the VEX-encoded instruction to the provided buffer.
pub fn encode(&self, sink: &mut MachBuffer<Inst>) {
if let RegisterOrAmode::Amode(amode) = &self.rm {
if amode.can_trap() {
sink.add_trap(TrapCode::HeapOutOfBounds);
}
}
// 2/3 byte prefix
if self.use_2byte_prefix() {
self.encode_2byte_prefix(sink);

(File diff suppressed because it is too large.)


@@ -617,13 +617,6 @@ impl RegMemImm {
}
}
pub(crate) fn to_reg(&self) -> Option<Reg> {
match self {
Self::Reg { reg } => Some(*reg),
_ => None,
}
}
pub(crate) fn with_allocs(&self, allocs: &mut AllocationConsumer<'_>) -> Self {
match self {
Self::Reg { reg } => Self::Reg {
@@ -726,12 +719,6 @@ impl RegMem {
RegMem::Mem { addr, .. } => addr.get_operands(collector),
}
}
pub(crate) fn to_reg(&self) -> Option<Reg> {
match self {
RegMem::Reg { reg } => Some(*reg),
_ => None,
}
}
pub(crate) fn with_allocs(&self, allocs: &mut AllocationConsumer<'_>) -> Self {
match self {
@@ -1510,10 +1497,108 @@ impl AvxOpcode {
| AvxOpcode::Vfmadd213ps
| AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA],
AvxOpcode::Vminps
| AvxOpcode::Vminpd
| AvxOpcode::Vmaxps
| AvxOpcode::Vmaxpd
| AvxOpcode::Vandnps
| AvxOpcode::Vandnpd
| AvxOpcode::Vpandn
| AvxOpcode::Vcmpps
| AvxOpcode::Vcmppd
| AvxOpcode::Vpsrlw
| AvxOpcode::Vpsrld
| AvxOpcode::Vpsrlq
| AvxOpcode::Vpaddb
| AvxOpcode::Vpaddw
| AvxOpcode::Vpaddd
| AvxOpcode::Vpaddq
| AvxOpcode::Vpaddsb
| AvxOpcode::Vpaddsw
| AvxOpcode::Vpaddusb
| AvxOpcode::Vpaddusw
| AvxOpcode::Vpsubb
| AvxOpcode::Vpsubw
| AvxOpcode::Vpsubd
| AvxOpcode::Vpsubq
| AvxOpcode::Vpsubsb
| AvxOpcode::Vpsubsw
| AvxOpcode::Vpsubusb
| AvxOpcode::Vpsubusw
| AvxOpcode::Vpavgb
| AvxOpcode::Vpavgw
| AvxOpcode::Vpand
| AvxOpcode::Vandps
| AvxOpcode::Vandpd
| AvxOpcode::Vpor
| AvxOpcode::Vorps
| AvxOpcode::Vorpd
| AvxOpcode::Vpxor
| AvxOpcode::Vxorps
| AvxOpcode::Vxorpd
| AvxOpcode::Vpmullw
| AvxOpcode::Vpmulld
| AvxOpcode::Vpmulhw
| AvxOpcode::Vpmulhd
| AvxOpcode::Vpmulhrsw
| AvxOpcode::Vpmulhuw
| AvxOpcode::Vpmuldq
| AvxOpcode::Vpmuludq
| AvxOpcode::Vpunpckhwd
| AvxOpcode::Vpunpcklwd
| AvxOpcode::Vunpcklps
| AvxOpcode::Vaddps
| AvxOpcode::Vaddpd
| AvxOpcode::Vsubps
| AvxOpcode::Vsubpd
| AvxOpcode::Vmulps
| AvxOpcode::Vmulpd
| AvxOpcode::Vdivps
| AvxOpcode::Vdivpd
| AvxOpcode::Vpcmpeqb
| AvxOpcode::Vpcmpeqw
| AvxOpcode::Vpcmpeqd
| AvxOpcode::Vpcmpeqq
| AvxOpcode::Vpcmpgtb
| AvxOpcode::Vpcmpgtw
| AvxOpcode::Vpcmpgtd
| AvxOpcode::Vpcmpgtq
| AvxOpcode::Vblendvps
| AvxOpcode::Vblendvpd
| AvxOpcode::Vpblendvb
| AvxOpcode::Vmovlhps
| AvxOpcode::Vpminsb
| AvxOpcode::Vpminsw
| AvxOpcode::Vpminsd
| AvxOpcode::Vpminub
| AvxOpcode::Vpminuw
| AvxOpcode::Vpminud
| AvxOpcode::Vpmaxsb
| AvxOpcode::Vpmaxsw
| AvxOpcode::Vpmaxsd
| AvxOpcode::Vpmaxub
| AvxOpcode::Vpmaxuw
| AvxOpcode::Vpmaxud
| AvxOpcode::Vpunpcklbw
| AvxOpcode::Vpunpckhbw
| AvxOpcode::Vpacksswb
| AvxOpcode::Vpackssdw
| AvxOpcode::Vpackuswb
| AvxOpcode::Vpackusdw
| AvxOpcode::Vpalignr
| AvxOpcode::Vpinsrb
| AvxOpcode::Vpinsrw
| AvxOpcode::Vpinsrd
| AvxOpcode::Vpinsrq
| AvxOpcode::Vpmaddwd
| AvxOpcode::Vpmaddubsw
| AvxOpcode::Vinsertps
| AvxOpcode::Vpshufb
| AvxOpcode::Vshufps
| AvxOpcode::Vpsllw
| AvxOpcode::Vpslld
| AvxOpcode::Vpsllq
| AvxOpcode::Vpsraw
| AvxOpcode::Vpsrad => {
smallvec![InstructionSet::AVX]
}
}


@@ -148,16 +148,10 @@ pub(crate) fn emit(
src2,
dst: reg_g,
} => {
let (reg_g, src2) = if inst.produces_const() {
let reg_g = allocs.next(reg_g.to_reg().to_reg());
(reg_g, RegMemImm::reg(reg_g))
} else {
let src1 = allocs.next(src1.to_reg());
let reg_g = allocs.next(reg_g.to_reg().to_reg());
debug_assert_eq!(src1, reg_g);
let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs);
(reg_g, src2)
};
let src1 = allocs.next(src1.to_reg());
let reg_g = allocs.next(reg_g.to_reg().to_reg());
debug_assert_eq!(src1, reg_g);
let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs);
let rex = RexFlags::from(*size);
if *op == AluRmiROpcode::Mul {
@@ -253,6 +247,23 @@ pub(crate) fn emit(
}
}
Inst::AluConstOp { op, size, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
emit(
&Inst::AluRmiR {
size: *size,
op: *op,
dst: Writable::from_reg(Gpr::new(dst).unwrap()),
src1: Gpr::new(dst).unwrap(),
src2: Gpr::new(dst).unwrap().into(),
},
allocs,
sink,
info,
state,
);
}
Inst::AluRM {
size,
src1_dst,
@@ -1837,16 +1848,10 @@ pub(crate) fn emit(
src2: src_e,
dst: reg_g,
} => {
let (src_e, reg_g) = if inst.produces_const() {
let reg_g = allocs.next(reg_g.to_reg().to_reg());
(RegMem::Reg { reg: reg_g }, reg_g)
} else {
let src1 = allocs.next(src1.to_reg());
let reg_g = allocs.next(reg_g.to_reg().to_reg());
let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
debug_assert_eq!(src1, reg_g);
(src_e, reg_g)
};
let src1 = allocs.next(src1.to_reg());
let reg_g = allocs.next(reg_g.to_reg().to_reg());
let src_e = src_e.clone().to_reg_mem().with_allocs(allocs);
debug_assert_eq!(src1, reg_g);
let rex = RexFlags::clear_w();
let (prefix, opcode, length) = match op {
@@ -1959,6 +1964,22 @@ pub(crate) fn emit(
}
}
Inst::XmmConstOp { op, dst } => {
let dst = allocs.next(dst.to_reg().to_reg());
emit(
&Inst::XmmRmR {
op: *op,
dst: Writable::from_reg(Xmm::new(dst).unwrap()),
src1: Xmm::new(dst).unwrap(),
src2: Xmm::new(dst).unwrap().into(),
},
allocs,
sink,
info,
state,
);
}
Inst::XmmRmRBlend {
op,
src1,
@@ -1998,6 +2019,9 @@ pub(crate) fn emit(
src2,
dst,
} => {
use LegacyPrefixes as LP;
use OpcodeMap as OM;
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs);
@@ -2008,8 +2032,15 @@ pub(crate) fn emit(
// `opcode_ext`, so handle that specially here.
RegMemImm::Imm { simm32 } => {
let (opcode, opcode_ext, prefix) = match op {
AvxOpcode::Vpsrlw => (0x71, 2, LegacyPrefixes::_66),
AvxOpcode::Vpsrld => (0x72, 2, LegacyPrefixes::_66),
_ => panic!("unexpected avx opcode with immediate {op:?}"),
AvxOpcode::Vpsrlq => (0x73, 2, LegacyPrefixes::_66),
AvxOpcode::Vpsllw => (0x71, 6, LegacyPrefixes::_66),
AvxOpcode::Vpslld => (0x72, 6, LegacyPrefixes::_66),
AvxOpcode::Vpsllq => (0x73, 6, LegacyPrefixes::_66),
AvxOpcode::Vpsraw => (0x71, 4, LegacyPrefixes::_66),
AvxOpcode::Vpsrad => (0x72, 4, LegacyPrefixes::_66),
_ => panic!("unexpected rmi_r_vex opcode with immediate {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
@@ -2029,18 +2060,104 @@ pub(crate) fn emit(
}
RegMemImm::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (prefix, map, opcode) = match op {
AvxOpcode::Vminps => (LP::None, OM::_0F, 0x5D),
AvxOpcode::Vminpd => (LP::_66, OM::_0F, 0x5D),
AvxOpcode::Vmaxps => (LP::None, OM::_0F, 0x5F),
AvxOpcode::Vmaxpd => (LP::_66, OM::_0F, 0x5F),
AvxOpcode::Vandnps => (LP::None, OM::_0F, 0x55),
AvxOpcode::Vandnpd => (LP::_66, OM::_0F, 0x55),
AvxOpcode::Vpandn => (LP::_66, OM::_0F, 0xDF),
AvxOpcode::Vpsrlw => (LP::_66, OM::_0F, 0xD1),
AvxOpcode::Vpsrld => (LP::_66, OM::_0F, 0xD2),
AvxOpcode::Vpsrlq => (LP::_66, OM::_0F, 0xD3),
AvxOpcode::Vpaddb => (LP::_66, OM::_0F, 0xFC),
AvxOpcode::Vpaddw => (LP::_66, OM::_0F, 0xFD),
AvxOpcode::Vpaddd => (LP::_66, OM::_0F, 0xFE),
AvxOpcode::Vpaddq => (LP::_66, OM::_0F, 0xD4),
AvxOpcode::Vpaddsb => (LP::_66, OM::_0F, 0xEC),
AvxOpcode::Vpaddsw => (LP::_66, OM::_0F, 0xED),
AvxOpcode::Vpaddusb => (LP::_66, OM::_0F, 0xDC),
AvxOpcode::Vpaddusw => (LP::_66, OM::_0F, 0xDD),
AvxOpcode::Vpsubb => (LP::_66, OM::_0F, 0xF8),
AvxOpcode::Vpsubw => (LP::_66, OM::_0F, 0xF9),
AvxOpcode::Vpsubd => (LP::_66, OM::_0F, 0xFA),
AvxOpcode::Vpsubq => (LP::_66, OM::_0F, 0xFB),
AvxOpcode::Vpsubsb => (LP::_66, OM::_0F, 0xE8),
AvxOpcode::Vpsubsw => (LP::_66, OM::_0F, 0xE9),
AvxOpcode::Vpsubusb => (LP::_66, OM::_0F, 0xD8),
AvxOpcode::Vpsubusw => (LP::_66, OM::_0F, 0xD9),
AvxOpcode::Vpavgb => (LP::_66, OM::_0F, 0xE0),
AvxOpcode::Vpavgw => (LP::_66, OM::_0F, 0xE3),
AvxOpcode::Vpand => (LP::_66, OM::_0F, 0xDB),
AvxOpcode::Vandps => (LP::None, OM::_0F, 0x54),
AvxOpcode::Vandpd => (LP::_66, OM::_0F, 0x54),
AvxOpcode::Vpor => (LP::_66, OM::_0F, 0xEB),
AvxOpcode::Vorps => (LP::None, OM::_0F, 0x56),
AvxOpcode::Vorpd => (LP::_66, OM::_0F, 0x56),
AvxOpcode::Vpxor => (LP::_66, OM::_0F, 0xEF),
AvxOpcode::Vxorps => (LP::None, OM::_0F, 0x57),
AvxOpcode::Vxorpd => (LP::_66, OM::_0F, 0x57),
AvxOpcode::Vpmullw => (LP::_66, OM::_0F, 0xD5),
AvxOpcode::Vpmulld => (LP::_66, OM::_0F38, 0x40),
AvxOpcode::Vpmulhw => (LP::_66, OM::_0F, 0xE5),
AvxOpcode::Vpmulhrsw => (LP::_66, OM::_0F38, 0x0B),
AvxOpcode::Vpmulhuw => (LP::_66, OM::_0F, 0xE4),
AvxOpcode::Vpmuldq => (LP::_66, OM::_0F38, 0x28),
AvxOpcode::Vpmuludq => (LP::_66, OM::_0F, 0xF4),
AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69),
AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61),
AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14),
AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58),
AvxOpcode::Vaddpd => (LP::_66, OM::_0F, 0x58),
AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C),
AvxOpcode::Vsubpd => (LP::_66, OM::_0F, 0x5C),
AvxOpcode::Vmulps => (LP::None, OM::_0F, 0x59),
AvxOpcode::Vmulpd => (LP::_66, OM::_0F, 0x59),
AvxOpcode::Vdivps => (LP::None, OM::_0F, 0x5E),
AvxOpcode::Vdivpd => (LP::_66, OM::_0F, 0x5E),
AvxOpcode::Vpcmpeqb => (LP::_66, OM::_0F, 0x74),
AvxOpcode::Vpcmpeqw => (LP::_66, OM::_0F, 0x75),
AvxOpcode::Vpcmpeqd => (LP::_66, OM::_0F, 0x76),
AvxOpcode::Vpcmpeqq => (LP::_66, OM::_0F38, 0x29),
AvxOpcode::Vpcmpgtb => (LP::_66, OM::_0F, 0x64),
AvxOpcode::Vpcmpgtw => (LP::_66, OM::_0F, 0x65),
AvxOpcode::Vpcmpgtd => (LP::_66, OM::_0F, 0x66),
AvxOpcode::Vpcmpgtq => (LP::_66, OM::_0F38, 0x37),
AvxOpcode::Vmovlhps => (LP::None, OM::_0F, 0x16),
AvxOpcode::Vpminsb => (LP::_66, OM::_0F38, 0x38),
AvxOpcode::Vpminsw => (LP::_66, OM::_0F, 0xEA),
AvxOpcode::Vpminsd => (LP::_66, OM::_0F38, 0x39),
AvxOpcode::Vpmaxsb => (LP::_66, OM::_0F38, 0x3C),
AvxOpcode::Vpmaxsw => (LP::_66, OM::_0F, 0xEE),
AvxOpcode::Vpmaxsd => (LP::_66, OM::_0F38, 0x3D),
AvxOpcode::Vpminub => (LP::_66, OM::_0F, 0xDA),
AvxOpcode::Vpminuw => (LP::_66, OM::_0F38, 0x3A),
AvxOpcode::Vpminud => (LP::_66, OM::_0F38, 0x3B),
AvxOpcode::Vpmaxub => (LP::_66, OM::_0F, 0xDE),
AvxOpcode::Vpmaxuw => (LP::_66, OM::_0F38, 0x3E),
AvxOpcode::Vpmaxud => (LP::_66, OM::_0F38, 0x3F),
AvxOpcode::Vpunpcklbw => (LP::_66, OM::_0F, 0x60),
AvxOpcode::Vpunpckhbw => (LP::_66, OM::_0F, 0x68),
AvxOpcode::Vpacksswb => (LP::_66, OM::_0F, 0x63),
AvxOpcode::Vpackssdw => (LP::_66, OM::_0F, 0x6B),
AvxOpcode::Vpackuswb => (LP::_66, OM::_0F, 0x67),
AvxOpcode::Vpackusdw => (LP::_66, OM::_0F38, 0x2B),
AvxOpcode::Vpmaddwd => (LP::_66, OM::_0F, 0xF5),
AvxOpcode::Vpmaddubsw => (LP::_66, OM::_0F38, 0x04),
AvxOpcode::Vpshufb => (LP::_66, OM::_0F38, 0x00),
AvxOpcode::Vpsllw => (LP::_66, OM::_0F, 0xF1),
AvxOpcode::Vpslld => (LP::_66, OM::_0F, 0xF2),
AvxOpcode::Vpsllq => (LP::_66, OM::_0F, 0xF3),
AvxOpcode::Vpsraw => (LP::_66, OM::_0F, 0xE1),
AvxOpcode::Vpsrad => (LP::_66, OM::_0F, 0xE2),
_ => panic!("unexpected rmir vex opcode {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
@@ -2056,27 +2173,70 @@ pub(crate) fn emit(
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (w, prefix, map, opcode) = match op {
AvxOpcode::Vcmpps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC2),
AvxOpcode::Vcmppd => (false, LegacyPrefixes::_66, OpcodeMap::_0F, 0xC2),
AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.imm(*imm)
.encode(sink);
}
Inst::XmmVexPinsr {
op,
src1,
src2,
dst,
imm,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let (w, map, opcode) = match op {
AvxOpcode::Vpinsrb => (false, OpcodeMap::_0F3A, 0x20),
AvxOpcode::Vpinsrw => (false, OpcodeMap::_0F, 0xC4),
AvxOpcode::Vpinsrd => (false, OpcodeMap::_0F3A, 0x22),
AvxOpcode::Vpinsrq => (true, OpcodeMap::_0F3A, 0x22),
_ => panic!("unexpected vex_pinsr opcode {op:?}"),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.imm(*imm)
.encode(sink);
}
Inst::XmmRmRVex3 {
@@ -2092,11 +2252,14 @@ pub(crate) fn emit(
let src2 = allocs.next(src2.to_reg());
let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
let (w, map, opcode) = match op {
AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9),
AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9),
AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8),
AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
_ => unreachable!(),
};
@@ -2104,7 +2267,7 @@ pub(crate) fn emit(
RegMem::Reg { reg: src } => VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F38)
.map(map)
.w(w)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
@@ -2115,6 +2278,42 @@ pub(crate) fn emit(
};
}
Inst::XmmRmRBlendVex {
op,
src1,
src2,
mask,
dst,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};
let mask = allocs.next(mask.to_reg());
let opcode = match op {
AvxOpcode::Vblendvps => 0x4A,
AvxOpcode::Vblendvpd => 0x4B,
AvxOpcode::Vpblendvb => 0x4C,
_ => unreachable!(),
};
VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(LegacyPrefixes::_66)
.map(OpcodeMap::_0F3A)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2)
.imm(mask.to_real_reg().unwrap().hw_enc() << 4)
.encode(sink);
}
Inst::XmmRmREvex {
op,
src1,
@@ -2259,10 +2458,7 @@ pub(crate) fn emit(
imm,
size,
} => {
let (src2, dst) = if !op.uses_src1() {
let dst = allocs.next(dst.to_reg());
let src2 = src2.with_allocs(allocs);
(src2, dst)


@@ -4860,6 +4860,20 @@ fn test_x64_emit() {
"roundpd $0, %xmm15, %xmm15",
));
// ========================================================
// XmmRmRImmVex
insns.push((
Inst::XmmVexPinsr {
op: AvxOpcode::Vpinsrb,
dst: Writable::from_reg(Xmm::new(xmm13).unwrap()),
src1: Xmm::new(xmm14).unwrap(),
src2: GprMem::new(RegMem::reg(r15)).unwrap(),
imm: 2,
},
"C4430920EF02",
"vpinsrb $2 %xmm14, %r15, %xmm13",
));
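
As an aside, the expected bytes can be sanity-checked by hand. This
annotated breakdown of the three-byte VEX encoding is my own reading of
the format, not part of the test:

// C4 43 09 20 EF 02 encodes `vpinsrb $2 %xmm14, %r15, %xmm13`.
const VPINSRB_BYTES: [u8; 6] = [
    0xC4, // three-byte VEX escape
    0x43, // ~R ~X ~B mmmmm = 0 1 0 00011: R and B extensions set, map 0F3A
    0x09, // W vvvv L pp = 0 0001 0 01: vvvv = !1 & 0xF = 14 (%xmm14), 128-bit, prefix 66
    0x20, // opcode for vpinsrb
    0xEF, // ModRM 11 101 111: reg = 5 + 8 = 13 (%xmm13), rm = 7 + 8 = 15 (%r15)
    0x02, // immediate: lane index 2
];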
// ========================================================
// Pertaining to atomics.
let am1: SyntheticAmode =
@@ -5135,6 +5149,7 @@ fn test_x64_emit() {
isa_flag_builder.enable("has_ssse3").unwrap();
isa_flag_builder.enable("has_sse41").unwrap();
isa_flag_builder.enable("has_fma").unwrap();
isa_flag_builder.enable("has_avx").unwrap();
isa_flag_builder.enable("has_avx512bitalg").unwrap();
isa_flag_builder.enable("has_avx512dq").unwrap();
isa_flag_builder.enable("has_avx512f").unwrap();


@@ -122,7 +122,8 @@ impl Inst {
| Inst::MachOTlsGetAddr { .. }
| Inst::CoffTlsGetAddr { .. }
| Inst::Unwind { .. }
| Inst::DummyUse { .. } => smallvec![],
| Inst::DummyUse { .. }
| Inst::AluConstOp { .. } => smallvec![],
Inst::AluRmRVex { op, .. } => op.available_from(),
Inst::UnaryRmR { op, .. } => op.available_from(),
@@ -136,7 +137,8 @@ impl Inst {
| Inst::XmmRmRImm { op, .. }
| Inst::XmmToGpr { op, .. }
| Inst::XmmUnaryRmRImm { op, .. }
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
| Inst::XmmUnaryRmR { op, .. }
| Inst::XmmConstOp { op, .. } => smallvec![op.available_from()],
Inst::XmmUnaryRmREvex { op, .. }
| Inst::XmmRmREvex { op, .. }
@@ -144,7 +146,9 @@ impl Inst {
Inst::XmmRmiRVex { op, .. }
| Inst::XmmRmRVex3 { op, .. }
| Inst::XmmRmRImmVex { op, .. } => op.available_from(),
| Inst::XmmRmRImmVex { op, .. }
| Inst::XmmRmRBlendVex { op, .. }
| Inst::XmmVexPinsr { op, .. } => op.available_from(),
}
}
}
@@ -622,40 +626,6 @@ impl Inst {
}
}
// Inst helpers.
impl Inst {
/// In certain cases, instructions of this format can act as a definition of an XMM register,
/// producing a value that is independent of its initial value.
///
/// For example, a vector equality comparison (`cmppd` or `cmpps`) that compares a register to
/// itself will generate all ones as a result, regardless of its value. From the register
/// allocator's point of view, we should (i) record the first register, which is normally a
/// mod, as a def instead; and (ii) not record the second register as a use, because it is the
/// same as the first register (already handled).
fn produces_const(&self) -> bool {
match self {
Self::AluRmiR { op, src1, src2, .. } => {
src2.clone().to_reg_mem_imm().to_reg() == Some(src1.to_reg())
&& (*op == AluRmiROpcode::Xor || *op == AluRmiROpcode::Sub)
}
Self::XmmRmR { op, src1, src2, .. } => {
src2.clone().to_reg_mem().to_reg() == Some(src1.to_reg())
&& (*op == SseOpcode::Xorps
|| *op == SseOpcode::Xorpd
|| *op == SseOpcode::Pxor
|| *op == SseOpcode::Pcmpeqb
|| *op == SseOpcode::Pcmpeqw
|| *op == SseOpcode::Pcmpeqd
|| *op == SseOpcode::Pcmpeqq)
}
_ => false,
}
}
}
//=============================================================================
// Instructions: printing
@@ -705,16 +675,6 @@ impl PrettyPrint for Inst {
match self {
Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len),
Inst::AluRmiR { size, op, dst, .. } if self.produces_const() => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
format!(
"{} {}, {}, {}",
ljustify2(op.to_string(), suffix_lqb(*size)),
dst,
dst,
dst
)
}
Inst::AluRmiR {
size,
op,
@@ -734,6 +694,14 @@ impl PrettyPrint for Inst {
dst
)
}
Inst::AluConstOp { op, dst, size } => {
let size_bytes = size.to_bytes();
let dst = pretty_print_reg(dst.to_reg().to_reg(), size_bytes, allocs);
format!(
"{} {dst}, {dst}, {dst}",
ljustify2(op.to_string(), suffix_lqb(*size)),
)
}
Inst::AluRM {
size,
op,
@@ -945,11 +913,6 @@ impl PrettyPrint for Inst {
format!("{} {}, {}", ljustify(op.to_string()), src, dst)
}
Inst::XmmRmR { op, dst, .. } if self.produces_const() => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
format!("{} {}, {}, {}", ljustify(op.to_string()), dst, dst, dst)
}
Inst::XmmRmR {
op,
src1,
@@ -963,6 +926,11 @@ impl PrettyPrint for Inst {
format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
}
Inst::XmmConstOp { op, dst } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
format!("{} {dst}, {dst}, {dst}", ljustify(op.to_string()))
}
Inst::XmmRmRBlend {
op,
src1,
@@ -1016,13 +984,22 @@ impl PrettyPrint for Inst {
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let src2 = src2.pretty_print(8, allocs);
format!(
"{} ${imm} {}, {}, {}",
ljustify(op.to_string()),
src1,
src2,
dst
)
format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string()))
}
Inst::XmmVexPinsr {
op,
src1,
src2,
dst,
imm,
..
} => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let src2 = src2.pretty_print(8, allocs);
format!("{} ${imm} {src1}, {src2}, {dst}", ljustify(op.to_string()))
}
Inst::XmmRmRVex3 {
@@ -1048,6 +1025,22 @@ impl PrettyPrint for Inst {
)
}
Inst::XmmRmRBlendVex {
op,
src1,
src2,
mask,
dst,
..
} => {
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let src2 = src2.pretty_print(8, allocs);
let mask = pretty_print_reg(mask.to_reg(), 8, allocs);
format!("{} {src1}, {src2}, {mask}, {dst}", ljustify(op.to_string()))
}
Inst::XmmRmREvex {
op,
src1,
@@ -1109,28 +1102,6 @@ impl PrettyPrint for Inst {
)
}
Inst::XmmRmRImm {
op, dst, imm, size, ..
} if self.produces_const() => {
let dst = pretty_print_reg(dst.to_reg(), 8, allocs);
format!(
"{} ${}, {}, {}, {}",
ljustify(format!(
"{}{}",
op.to_string(),
if *size == OperandSize::Size64 {
".w"
} else {
""
}
)),
imm,
dst,
dst,
dst,
)
}
Inst::XmmRmRImm {
op,
src1,
@@ -1799,14 +1770,11 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
Inst::AluRmiR {
src1, src2, dst, ..
} => {
if inst.produces_const() {
collector.reg_def(dst.to_writable_reg());
} else {
collector.reg_use(src1.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);
src2.get_operands(collector);
}
collector.reg_use(src1.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);
src2.get_operands(collector);
}
Inst::AluConstOp { dst, .. } => collector.reg_def(dst.to_writable_reg()),
Inst::AluRM { src1_dst, src2, .. } => {
collector.reg_use(src2.to_reg());
src1_dst.get_operands(collector);
@@ -1904,13 +1872,9 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
Inst::XmmRmR {
src1, src2, dst, ..
} => {
if inst.produces_const() {
collector.reg_def(dst.to_writable_reg());
} else {
collector.reg_use(src1.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);
src2.get_operands(collector);
}
collector.reg_use(src1.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);
src2.get_operands(collector);
}
Inst::XmmRmRBlend {
src1,
@@ -1943,6 +1907,13 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_use(src1.to_reg());
src2.get_operands(collector);
}
Inst::XmmVexPinsr {
src1, src2, dst, ..
} => {
collector.reg_def(dst.to_writable_reg());
collector.reg_use(src1.to_reg());
src2.get_operands(collector);
}
Inst::XmmRmRVex3 {
op,
src1,
@@ -1966,6 +1937,18 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_use(src2.to_reg());
src3.get_operands(collector);
}
Inst::XmmRmRBlendVex {
src1,
src2,
mask,
dst,
..
} => {
collector.reg_def(dst.to_writable_reg());
collector.reg_use(src1.to_reg());
src2.get_operands(collector);
collector.reg_use(mask.to_reg());
}
Inst::XmmRmREvex {
op,
src1,
@@ -1999,9 +1982,7 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
dst,
..
} => {
if !op.uses_src1() {
// FIXME: split this instruction into two, so we don't
// need this awkward src1-is-only-sometimes-an-arg
// behavior.
@@ -2013,6 +1994,9 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
src2.get_operands(collector);
}
}
Inst::XmmConstOp { dst, .. } => {
collector.reg_def(dst.to_writable_reg());
}
Inst::XmmUninitializedValue { dst } => collector.reg_def(dst.to_writable_reg()),
Inst::XmmMinMaxSeq { lhs, rhs, dst, .. } => {
collector.reg_use(rhs.to_reg());


@@ -1333,11 +1333,11 @@
;; i32x4.replace_lane
(rule (vec_insert_lane $I32X4 vec val idx)
(x64_pinsrd vec val idx (OperandSize.Size32)))
(x64_pinsrd vec val idx))
;; i64x2.replace_lane
(rule (vec_insert_lane $I64X2 vec val idx)
(x64_pinsrd vec val idx (OperandSize.Size64)))
(x64_pinsrq vec val idx))
;; f32x4.replace_lane
(rule (vec_insert_lane $F32X4 vec val idx)
@@ -2982,8 +2982,8 @@
;; Rules for `fcvt_low_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (fcvt_low_from_sint a @ (value_type ty)))
(x64_cvtdq2pd ty a))
(rule (lower (fcvt_low_from_sint a))
(x64_cvtdq2pd a))
;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3148,7 +3148,7 @@
;; Converting to unsigned int so if float src is negative or NaN
;; will first set to zero.
(tmp2 Xmm (x64_pxor src src)) ;; make a zero
(tmp2 Xmm (xmm_zero $F32X4))
(dst Xmm (x64_maxps src tmp2))
;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks
@@ -3181,7 +3181,7 @@
;; that have positive overflow (based on the mask) by setting these lanes
;; to 0x7FFFFFFF
(tmp1 Xmm (x64_pxor tmp1 tmp2))
(tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero
(tmp2 Xmm (xmm_zero $I32X4))
(tmp1 Xmm (x64_pmaxsd tmp1 tmp2)))
;; Add this second set of converted lanes to the original to properly handle
@@ -3611,7 +3611,7 @@
(rule (lower (has_type (multi_lane 8 16) (splat src)))
(let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
(zeros Xmm (x64_pxor vec vec)))
(zeros Xmm (xmm_zero $I8X16)))
;; Shuffle the lowest byte lane to all other lanes.
(x64_pshufb vec zeros)))
@@ -3661,7 +3661,7 @@
(rule (lower (vall_true val @ (value_type ty)))
(let ((src Xmm val)
(zeros Xmm (x64_pxor src src))
(zeros Xmm (xmm_zero ty))
(cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
(with_flags (x64_ptest cmp cmp) (x64_setcc (CC.Z)))))
@@ -3752,7 +3752,7 @@
;; MOVAPD xmm_y, xmm_x
;; XORPD xmm_tmp, xmm_tmp
(zeros Xmm (x64_xorpd src src))
(zeros Xmm (xmm_zero $F64X2))
(dst Xmm (x64_maxpd src zeros))
(umax_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_umax_mask)))


@@ -891,12 +891,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
None
};
let dividend_hi = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
self.lower_ctx.emit(MInst::alu_rmi_r(
OperandSize::Size32,
AluRmiROpcode::Xor,
RegMemImm::reg(dividend_hi.to_reg()),
dividend_hi,
));
self.lower_ctx.emit(MInst::AluConstOp {
op: AluRmiROpcode::Xor,
size: OperandSize::Size32,
dst: WritableGpr::from_reg(Gpr::new(dividend_hi.to_reg()).unwrap()),
});
self.lower_ctx.emit(MInst::checked_div_or_rem_seq(
kind.clone(),
size,