diff --git a/cranelift/codegen/src/isa/x64/encoding/rex.rs b/cranelift/codegen/src/isa/x64/encoding/rex.rs
index 41ae596eba..f48fea8306 100644
--- a/cranelift/codegen/src/isa/x64/encoding/rex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/rex.rs
@@ -312,18 +312,50 @@ pub(crate) fn emit_std_enc_mem(
     prefixes.emit(sink);
 
+    // After prefixes, first emit the REX byte depending on the kind of
+    // addressing mode that's being used.
     match *mem_e {
-        Amode::ImmReg { simm32, base, .. } => {
-            // First, the REX byte.
+        Amode::ImmReg { base, .. } => {
             let enc_e = int_reg_enc(base);
             rex.emit_two_op(sink, enc_g, enc_e);
+        }
 
-            // Now the opcode(s). These include any other prefixes the caller
-            // hands to us.
-            while num_opcodes > 0 {
-                num_opcodes -= 1;
-                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
-            }
+        Amode::ImmRegRegShift {
+            base: reg_base,
+            index: reg_index,
+            ..
+        } => {
+            let enc_base = int_reg_enc(*reg_base);
+            let enc_index = int_reg_enc(*reg_index);
+            rex.emit_three_op(sink, enc_g, enc_index, enc_base);
+        }
+
+        Amode::RipRelative { .. } => {
+            // note REX.B = 0.
+            rex.emit_two_op(sink, enc_g, 0);
+        }
+    }
+
+    // Now the opcode(s). These include any other prefixes the caller
+    // hands to us.
+    while num_opcodes > 0 {
+        num_opcodes -= 1;
+        sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
+    }
+
+    // And finally encode the mod/rm bytes and all further information.
+    emit_modrm_sib_disp(sink, enc_g, mem_e, bytes_at_end)
+}
+
+pub(crate) fn emit_modrm_sib_disp(
+    sink: &mut MachBuffer<Inst>,
+    enc_g: u8,
+    mem_e: &Amode,
+    bytes_at_end: u8,
+) {
+    match *mem_e {
+        Amode::ImmReg { simm32, base, .. } => {
+            let enc_e = int_reg_enc(base);
 
             // Now the mod/rm and associated immediates. This is
             // significantly complicated due to the multiple special cases.
@@ -377,15 +409,6 @@ pub(crate) fn emit_std_enc_mem(
             let enc_base = int_reg_enc(*reg_base);
             let enc_index = int_reg_enc(*reg_index);
 
-            // The rex byte.
-            rex.emit_three_op(sink, enc_g, enc_index, enc_base);
-
-            // All other prefixes and opcodes.
-            while num_opcodes > 0 {
-                num_opcodes -= 1;
-                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
-            }
-
             // modrm, SIB, immediates.
             if low8_will_sign_extend_to_32(simm32) && enc_index != regs::ENC_RSP {
                 sink.put1(encode_modrm(1, enc_g & 7, 4));
@@ -401,16 +424,6 @@ pub(crate) fn emit_std_enc_mem(
         }
 
         Amode::RipRelative { ref target } => {
-            // First, the REX byte, with REX.B = 0.
-            rex.emit_two_op(sink, enc_g, 0);
-
-            // Now the opcode(s). These include any other prefixes the caller
-            // hands to us.
-            while num_opcodes > 0 {
-                num_opcodes -= 1;
-                sink.put1(((opcodes >> (num_opcodes << 3)) & 0xFF) as u8);
-            }
-
             // RIP-relative is mod=00, rm=101.
             sink.put1(encode_modrm(0, enc_g & 7, 0b101));
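With both the REX and VEX paths now funneling through `emit_modrm_sib_disp`, the ModRM/SIB/displacement rules live in one place. The following is a self-contained sketch of those rules for reference; the types and the zeroed disp32 placeholder are illustrative rather than Cranelift's, and the real helper handles extra cases (notably `bytes_at_end`, which offsets RIP-relative fixups past a trailing immediate).

    // Sketch of the ModRM/SIB/disp selection, simplified from the helper above.
    #[derive(Clone, Copy)]
    enum Addr {
        BaseDisp { base: u8, disp: i32 },                            // [base + disp]
        BaseIndexDisp { base: u8, index: u8, shift: u8, disp: i32 }, // [base + index*scale + disp]
        RipRelative,                                                 // [rip + disp32]
    }

    fn modrm(mode: u8, reg: u8, rm: u8) -> u8 {
        (mode << 6) | ((reg & 7) << 3) | (rm & 7)
    }

    fn sib(shift: u8, index: u8, base: u8) -> u8 {
        (shift << 6) | ((index & 7) << 3) | (base & 7)
    }

    fn emit_modrm_sib_disp(out: &mut Vec<u8>, enc_g: u8, addr: Addr) {
        match addr {
            Addr::BaseDisp { base, disp } => {
                // mod=00 normally means "no displacement", but rm=100 selects a
                // SIB byte (rsp/r12) and rm=101 selects rip-relative (rbp/r13),
                // so those bases fall through to the displacement forms.
                if disp == 0 && base & 7 != 4 && base & 7 != 5 {
                    out.push(modrm(0b00, enc_g, base));
                } else if let Ok(d8) = i8::try_from(disp) {
                    out.push(modrm(0b01, enc_g, base)); // mod=01: disp8 follows
                    if base & 7 == 4 {
                        out.push(sib(0, 0b100, base)); // index=100 means "no index"
                    }
                    out.push(d8 as u8);
                } else {
                    out.push(modrm(0b10, enc_g, base)); // mod=10: disp32 follows
                    if base & 7 == 4 {
                        out.push(sib(0, 0b100, base));
                    }
                    out.extend_from_slice(&disp.to_le_bytes());
                }
            }
            Addr::BaseIndexDisp { base, index, shift, disp } => {
                // Base+index always goes through a SIB byte, reached via rm=100.
                // (An index whose low bits are 100 relies on REX.X/VEX.X to
                // disambiguate r12; the real helper is more careful here.)
                if let Ok(d8) = i8::try_from(disp) {
                    out.push(modrm(0b01, enc_g, 0b100));
                    out.push(sib(shift, index, base));
                    out.push(d8 as u8);
                } else {
                    out.push(modrm(0b10, enc_g, 0b100));
                    out.push(sib(shift, index, base));
                    out.extend_from_slice(&disp.to_le_bytes());
                }
            }
            Addr::RipRelative => {
                // rip-relative is mod=00, rm=101; the real code emits a label
                // relocation where this sketch writes a placeholder disp32.
                out.push(modrm(0b00, enc_g, 0b101));
                out.extend_from_slice(&0i32.to_le_bytes());
            }
        }
    }

    fn main() {
        // [r13 + 10] with reg field xmm2 (enc 2): mod=01, rm=101, disp8=10,
        // matching the trailing `55 0a` of the `vandnps_mem` test below.
        let mut out = Vec::new();
        emit_modrm_sib_disp(&mut out, 2, Addr::BaseDisp { base: 13, disp: 10 });
        assert_eq!(out, [0x55, 0x0a]);

        // [rax + r13*4 + 100] with reg field xmm2: matches the `54 a8 64`
        // tail of the `vandnps_more_mem` test (100 == 0x64).
        let mut out = Vec::new();
        emit_modrm_sib_disp(&mut out, 2, Addr::BaseIndexDisp { base: 0, index: 13, shift: 2, disp: 100 });
        assert_eq!(out, [0x54, 0xa8, 100]);
    }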
diff --git a/cranelift/codegen/src/isa/x64/encoding/vex.rs b/cranelift/codegen/src/isa/x64/encoding/vex.rs
index 3aa9bb8d50..df2c921697 100644
--- a/cranelift/codegen/src/isa/x64/encoding/vex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/vex.rs
@@ -4,7 +4,10 @@
 use super::evex::Register;
 use super::rex::{LegacyPrefixes, OpcodeMap};
 use super::ByteSink;
-use crate::isa::x64::encoding::rex::encode_modrm;
+use crate::isa::x64::args::Amode;
+use crate::isa::x64::encoding::rex;
+use crate::isa::x64::inst::Inst;
+use crate::machinst::MachBuffer;
 
 /// Constructs a VEX-encoded instruction using a builder pattern. This approach makes it visually
 /// easier to transform something from the manual's syntax, `VEX.128.66.0F 73 /7 ib`, to code:
@@ -16,11 +19,29 @@ pub struct VexInstruction {
     opcode: u8,
     w: bool,
     reg: u8,
-    rm: Register,
+    rm: RegisterOrAmode,
     vvvv: Option<u8>,
     imm: Option<u8>,
 }
 
+#[allow(missing_docs)]
+pub enum RegisterOrAmode {
+    Register(Register),
+    Amode(Amode),
+}
+
+impl From<u8> for RegisterOrAmode {
+    fn from(reg: u8) -> Self {
+        RegisterOrAmode::Register(reg.into())
+    }
+}
+
+impl From<Amode> for RegisterOrAmode {
+    fn from(amode: Amode) -> Self {
+        RegisterOrAmode::Amode(amode)
+    }
+}
+
 impl Default for VexInstruction {
     fn default() -> Self {
         Self {
@@ -30,7 +51,7 @@ impl Default for VexInstruction {
             opcode: 0x00,
             w: false,
             reg: 0x00,
-            rm: Register::default(),
+            rm: RegisterOrAmode::Register(Register::default()),
             vvvv: None,
             imm: None,
         }
@@ -105,12 +126,12 @@ impl VexInstruction {
         self
     }
 
-    /// Set the register to use for the `rm` bits; many instructions use this as the "read from
-    /// register/memory" operand. Currently this does not support memory addressing (TODO).Setting
-    /// this affects both the ModRM byte (`rm` section) and the VEX prefix (the extension bits for
-    /// register encodings > 8).
+    /// Set the register to use for the `rm` bits; many instructions use this
+    /// as the "read from register/memory" operand. Setting this affects both
+    /// the ModRM byte (`rm` section) and the VEX prefix (the extension bits
+    /// for register encodings > 8).
     #[inline(always)]
-    pub fn rm(mut self, reg: impl Into<Register>) -> Self {
+    pub fn rm(mut self, reg: impl Into<RegisterOrAmode>) -> Self {
         self.rm = reg.into();
         self
     }
@@ -150,15 +171,33 @@ impl VexInstruction {
     /// The X bit in encoded format (inverted).
     #[inline(always)]
     fn x_bit(&self) -> u8 {
-        // TODO
-        (!0) & 1
+        let reg = match &self.rm {
+            RegisterOrAmode::Register(_) => 0,
+            RegisterOrAmode::Amode(Amode::ImmReg { .. }) => 0,
+            RegisterOrAmode::Amode(Amode::ImmRegRegShift { index, .. }) => {
+                index.to_real_reg().unwrap().hw_enc()
+            }
+            RegisterOrAmode::Amode(Amode::RipRelative { .. }) => 0,
+        };
+
+        !(reg >> 3) & 1
     }
 
     /// The B bit in encoded format (inverted).
     #[inline(always)]
     fn b_bit(&self) -> u8 {
-        let rm: u8 = self.rm.into();
-        (!(rm >> 3)) & 1
+        let reg = match &self.rm {
+            RegisterOrAmode::Register(r) => (*r).into(),
+            RegisterOrAmode::Amode(Amode::ImmReg { base, .. }) => {
+                base.to_real_reg().unwrap().hw_enc()
+            }
+            RegisterOrAmode::Amode(Amode::ImmRegRegShift { base, .. }) => {
+                base.to_real_reg().unwrap().hw_enc()
+            }
+            RegisterOrAmode::Amode(Amode::RipRelative { .. }) => 0,
+        };
+
+        !(reg >> 3) & 1
     }
 
     /// Is the 2 byte prefix available for this instruction?
@@ -176,6 +215,7 @@ impl VexInstruction {
         // encoded by the three-byte form of VEX
         !(self.map == OpcodeMap::_0F3A || self.map == OpcodeMap::_0F38)
     }
+
     /// The last byte of the 2byte and 3byte prefixes is mostly the same, share the common
     /// encoding logic here.
     #[inline(always)]
@@ -225,8 +265,8 @@ impl VexInstruction {
         sink.put1(last_byte);
     }
 
-    /// Emit the VEX-encoded instruction to the code sink:
-    pub fn encode<CS: ByteSink>(&self, sink: &mut CS) {
+    /// Emit the VEX-encoded instruction to the provided buffer.
+    pub fn encode(&self, sink: &mut MachBuffer<Inst>) {
         // 2/3 byte prefix
         if self.use_2byte_prefix() {
             self.encode_2byte_prefix(sink);
@@ -237,13 +277,21 @@ impl VexInstruction {
         // 1 Byte Opcode
         sink.put1(self.opcode);
 
-        // 1 ModRM Byte
-        // Not all instructions use Reg as a reg, some use it as an extension of the opcode.
-        let rm: u8 = self.rm.into();
-        sink.put1(encode_modrm(3, self.reg & 7, rm & 7));
-
-        // TODO: 0/1 byte SIB
-        // TODO: 0/1/2/4 bytes DISP
+        match &self.rm {
+            // Not all instructions use Reg as a reg, some use it as an extension
+            // of the opcode.
+            RegisterOrAmode::Register(reg) => {
+                let rm: u8 = (*reg).into();
+                sink.put1(rex::encode_modrm(3, self.reg & 7, rm & 7));
+            }
+            // For address-based modes reuse the logic from the `rex` module
+            // for the modrm and trailing bytes since VEX uses the same
+            // encoding.
+            RegisterOrAmode::Amode(amode) => {
+                let bytes_at_end = if self.imm.is_some() { 1 } else { 0 };
+                rex::emit_modrm_sib_disp(sink, self.reg & 7, amode, bytes_at_end);
+            }
+        }
 
         // Optional 1 Byte imm
         if let Some(imm) = self.imm {
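The inverted extension bits computed in `x_bit` and `b_bit` above are worth a worked example. A minimal sketch of the same `!(enc >> 3) & 1` arithmetic (hardware encodings here are assumed, matching the unit tests below):

    fn inverted_extension_bit(hw_enc: u8) -> u8 {
        !(hw_enc >> 3) & 1
    }

    fn main() {
        // xmm2 has hardware encoding 2: bit 3 clear, so the inverted bit is 1.
        assert_eq!(inverted_extension_bit(2), 1);
        // r13 has hardware encoding 13 (0b1101): bit 3 set, so the inverted
        // bit is 0. The two-byte VEX prefix can only express X = B = 1, which
        // is why an address based on r13 forces the three-byte `0xC4` form in
        // the `vandnps_mem` test below.
        assert_eq!(inverted_extension_bit(13), 0);
    }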
@@ -278,8 +326,9 @@ impl Default for VexVectorLength {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::isa::x64::inst::args::Gpr;
     use crate::isa::x64::inst::regs;
-    use std::vec::Vec;
+    use crate::ir::MemFlags;
 
     #[test]
     fn vpslldq() {
@@ -288,7 +337,7 @@ mod tests {
         let dst = regs::xmm1().to_real_reg().unwrap().hw_enc();
         let src = regs::xmm2().to_real_reg().unwrap().hw_enc();
-        let mut sink0 = Vec::new();
+        let mut sink = MachBuffer::new();
 
         VexInstruction::new()
             .length(VexVectorLength::V128)
@@ -299,9 +348,10 @@ mod tests {
             .vvvv(dst)
             .rm(src)
             .imm(0x17)
-            .encode(&mut sink0);
+            .encode(&mut sink);
 
-        assert_eq!(sink0, vec![0xc5, 0xf1, 0x73, 0xfa, 0x17]);
+        let bytes = sink.finish().data;
+        assert_eq!(bytes.as_slice(), [0xc5, 0xf1, 0x73, 0xfa, 0x17]);
     }
 
     #[test]
@@ -314,7 +364,7 @@ mod tests {
         let a = regs::xmm2().to_real_reg().unwrap().hw_enc();
         let b = regs::xmm3().to_real_reg().unwrap().hw_enc();
         let c = regs::xmm4().to_real_reg().unwrap().hw_enc();
-        let mut sink0 = Vec::new();
+        let mut sink = MachBuffer::new();
 
         VexInstruction::new()
             .length(VexVectorLength::V128)
@@ -326,9 +376,10 @@ mod tests {
             .vvvv(a)
             .rm(b)
             .imm_reg(c)
-            .encode(&mut sink0);
+            .encode(&mut sink);
 
-        assert_eq!(sink0, vec![0xc4, 0xe3, 0x69, 0x4b, 0xcb, 0x40]);
+        let bytes = sink.finish().data;
+        assert_eq!(bytes.as_slice(), [0xc4, 0xe3, 0x69, 0x4b, 0xcb, 0x40]);
     }
 
     #[test]
@@ -339,7 +390,7 @@ mod tests {
         let dst = regs::xmm10().to_real_reg().unwrap().hw_enc();
         let a = regs::xmm11().to_real_reg().unwrap().hw_enc();
         let b = regs::xmm12().to_real_reg().unwrap().hw_enc();
-        let mut sink0 = Vec::new();
+        let mut sink = MachBuffer::new();
 
         VexInstruction::new()
             .length(VexVectorLength::V256)
@@ -350,8 +401,91 @@ mod tests {
             .vvvv(a)
             .rm(b)
             .imm(4)
-            .encode(&mut sink0);
+            .encode(&mut sink);
 
-        assert_eq!(sink0, vec![0xc4, 0x41, 0x24, 0xc2, 0xd4, 0x04]);
+        let bytes = sink.finish().data;
+        assert_eq!(bytes.as_slice(), [0xc4, 0x41, 0x24, 0xc2, 0xd4, 0x04]);
+    }
+
+    #[test]
+    fn vandnps() {
+        // VEX.128.0F 55 /r
+        // VANDNPS xmm0, xmm1, xmm2
+
+        let dst = regs::xmm2().to_real_reg().unwrap().hw_enc();
+        let src1 = regs::xmm1().to_real_reg().unwrap().hw_enc();
+        let src2 = regs::xmm0().to_real_reg().unwrap().hw_enc();
+        let mut sink = MachBuffer::new();
+
+        VexInstruction::new()
+            .length(VexVectorLength::V128)
+            .prefix(LegacyPrefixes::None)
+            .map(OpcodeMap::_0F)
+            .opcode(0x55)
+            .reg(dst)
+            .vvvv(src1)
+            .rm(src2)
+            .encode(&mut sink);
+
+        let bytes = sink.finish().data;
+        assert_eq!(bytes.as_slice(), [0xc5, 0xf0, 0x55, 0xd0]);
+    }
+
+    #[test]
+    fn vandnps_mem() {
+        // VEX.128.0F 55 /r
+        // VANDNPS 10(%r13), xmm1, xmm2
+
+        let dst = regs::xmm2().to_real_reg().unwrap().hw_enc();
+        let src1 = regs::xmm1().to_real_reg().unwrap().hw_enc();
+        let src2 = Amode::ImmReg {
+            base: regs::r13(),
+            flags: MemFlags::trusted(),
+            simm32: 10,
+        };
+        let mut sink = MachBuffer::new();
+
+        VexInstruction::new()
+            .length(VexVectorLength::V128)
+            .prefix(LegacyPrefixes::None)
+            .map(OpcodeMap::_0F)
+            .opcode(0x55)
+            .reg(dst)
+            .vvvv(src1)
+            .rm(src2)
+            .encode(&mut sink);
+
+        let bytes = sink.finish().data;
+        assert_eq!(bytes.as_slice(), [0xc4, 0xc1, 0x70, 0x55, 0x55, 0x0a]);
+    }
+
+    #[test]
+    fn vandnps_more_mem() {
+        // VEX.128.0F 55 /r
+        // VANDNPS 100(%rax,%r13,4), xmm1, xmm2
+
+        let dst = regs::xmm2().to_real_reg().unwrap().hw_enc();
+        let src1 = regs::xmm1().to_real_reg().unwrap().hw_enc();
+        let src2 = Amode::ImmRegRegShift {
+            base: Gpr::new(regs::rax()).unwrap(),
+            index: Gpr::new(regs::r13()).unwrap(),
+            flags: MemFlags::trusted(),
+            simm32: 100,
+            shift: 2,
+        };
+        let mut sink = MachBuffer::new();
+
+        VexInstruction::new()
+            .length(VexVectorLength::V128)
+            .prefix(LegacyPrefixes::None)
+            .map(OpcodeMap::_0F)
+            .opcode(0x55)
+            .reg(dst)
+            .vvvv(src1)
+            .rm(src2)
+            .encode(&mut sink);
+
+        let bytes = sink.finish().data;
+        assert_eq!(bytes.as_slice(), [0xc4, 0xa1, 0x70, 0x55, 0x54, 0xa8, 100]);
+    }
 }
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 715d9fd879..f0f3c70d06 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -227,8 +227,24 @@
    (mask Xmm)
    (dst WritableXmm))
 
-  ;; XMM (scalar or vector) binary op that relies on the VEX prefix.
-  (XmmRmRVex (op AvxOpcode)
+  ;; XMM (scalar or vector) binary op that relies on the VEX prefix and
+  ;; has two inputs.
+  (XmmRmiRVex (op AvxOpcode)
+              (src1 Xmm)
+              (src2 XmmMemImm)
+              (dst WritableXmm))
+
+  ;; XMM (scalar or vector) ternary op that relies on the VEX prefix and
+  ;; has two dynamic inputs plus one immediate input.
+  (XmmRmRImmVex (op AvxOpcode)
+                (src1 Xmm)
+                (src2 XmmMem)
+                (dst WritableXmm)
+                (imm u8))
+
+  ;; XMM (scalar or vector) ternary op that relies on the VEX prefix and
+  ;; has three dynamic inputs.
+  (XmmRmRVex3 (op AvxOpcode)
              (src1 Xmm)
              (src2 Xmm)
              (src3 XmmMem)
              (dst WritableXmm))
@@ -1132,11 +1148,16 @@
 (decl cc_nz_or_z (CC) CC)
 (extern extractor cc_nz_or_z cc_nz_or_z)
 
-(type AvxOpcode extern
+(type AvxOpcode
   (enum Vfmadd213ss
         Vfmadd213sd
         Vfmadd213ps
-        Vfmadd213pd))
+        Vfmadd213pd
+        Vminps
+        Vorps
+        Vandnps
+        Vcmpps
+        Vpsrld))
 
 (type Avx512Opcode extern
   (enum Vcvtudq2ps
@@ -1226,6 +1247,10 @@
 (decl xmm_to_xmm_mem_imm (Xmm) XmmMemImm)
 (extern constructor xmm_to_xmm_mem_imm xmm_to_xmm_mem_imm)
 
+;; Convert an `XmmMem` into an `XmmMemImm`.
+(decl xmm_mem_to_xmm_mem_imm (XmmMem) XmmMemImm)
+(extern constructor xmm_mem_to_xmm_mem_imm xmm_mem_to_xmm_mem_imm)
+
 ;; Allocate a new temporary GPR register.
 (decl temp_writable_gpr () WritableGpr)
 (extern constructor temp_writable_gpr temp_writable_gpr)
@@ -1438,6 +1463,9 @@
 (decl use_sse41 (bool) Type)
 (extern extractor infallible use_sse41 use_sse41)
 
+(decl pure has_avx () bool)
+(extern constructor has_avx has_avx)
+
 ;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Extract a constant `Imm8Reg.Imm8` from a value operand.
@@ -2285,8 +2313,11 @@
 
 ;; Helper for creating `orps` instructions.
 (decl x64_orps (Xmm XmmMem) Xmm)
-(rule (x64_orps src1 src2)
+(rule 0 (x64_orps src1 src2)
   (xmm_rm_r (SseOpcode.Orps) src1 src2))
+(rule 1 (x64_orps src1 src2)
+      (if-let $true (has_avx))
+      (xmm_rmir_vex (AvxOpcode.Vorps) src1 src2))
 
 ;; Helper for creating `orpd` instructions.
 (decl x64_orpd (Xmm XmmMem) Xmm)
@@ -2360,8 +2391,11 @@
 
 ;; Helper for creating `andnps` instructions.
(decl x64_andnps (Xmm XmmMem) Xmm) -(rule (x64_andnps src1 src2) +(rule 0 (x64_andnps src1 src2) (xmm_rm_r (SseOpcode.Andnps) src1 src2)) +(rule 1 (x64_andnps src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vandnps) src1 src2)) ;; Helper for creating `andnpd` instructions. (decl x64_andnpd (Xmm XmmMem) Xmm) @@ -2602,12 +2636,18 @@ (rule (x64_cmpp $F64X2 x y imm) (x64_cmppd x y imm)) (decl x64_cmpps (Xmm XmmMem FcmpImm) Xmm) -(rule (x64_cmpps src1 src2 imm) +(rule 0 (x64_cmpps src1 src2 imm) (xmm_rm_r_imm (SseOpcode.Cmpps) src1 src2 (encode_fcmp_imm imm) (OperandSize.Size32))) +(rule 1 (x64_cmpps src1 src2 imm) + (if-let $true (has_avx)) + (xmm_rmr_imm_vex (AvxOpcode.Vcmpps) + src1 + src2 + (encode_fcmp_imm imm))) ;; Note that `Size32` is intentional despite this being used for 64-bit ;; operations, since this presumably induces the correct encoding of the @@ -2858,8 +2898,11 @@ ;; Helper for creating `psrld` instructions. (decl x64_psrld (Xmm XmmMemImm) Xmm) -(rule (x64_psrld src1 src2) +(rule 0 (x64_psrld src1 src2) (xmm_rmi_xmm (SseOpcode.Psrld) src1 src2)) +(rule 1 (x64_psrld src1 src2) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vpsrld) src1 src2)) ;; Helper for creating `psrlq` instructions. (decl x64_psrlq (Xmm XmmMemImm) Xmm) @@ -3070,10 +3113,11 @@ ;; Helper for creating `minps` instructions. (decl x64_minps (Xmm Xmm) Xmm) -(rule (x64_minps x y) - (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmRmR (SseOpcode.Minps) x y dst)))) - dst)) +(rule 0 (x64_minps x y) + (xmm_rm_r (SseOpcode.Minps) x y)) +(rule 1 (x64_minps x y) + (if-let $true (has_avx)) + (xmm_rmir_vex (AvxOpcode.Vminps) x y)) ;; Helper for creating `minpd` instructions. (decl x64_minpd (Xmm Xmm) Xmm) @@ -3101,15 +3145,25 @@ (xmm_rm_r (SseOpcode.Maxpd) x y)) -;; Helper for creating `MInst.XmmRmRVex` instructions. -(decl xmm_rmr_vex (AvxOpcode Xmm Xmm XmmMem) Xmm) -(rule (xmm_rmr_vex op src1 src2 src3) +;; Helper for creating `MInst.XmmRmiRVex` instructions. +(decl xmm_rmir_vex (AvxOpcode Xmm XmmMemImm) Xmm) +(rule (xmm_rmir_vex op src1 src2) (let ((dst WritableXmm (temp_writable_xmm)) - (_ Unit (emit (MInst.XmmRmRVex op - src1 - src2 - src3 - dst)))) + (_ Unit (emit (MInst.XmmRmiRVex op src1 src2 dst)))) + dst)) + +;; Helper for creating `MInst.XmmRmRImmVex` instructions. +(decl xmm_rmr_imm_vex (AvxOpcode Xmm XmmMem u8) Xmm) +(rule (xmm_rmr_imm_vex op src1 src2 imm) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmRmRImmVex op src1 src2 dst imm)))) + dst)) + +;; Helper for creating `MInst.XmmRmRVex3` instructions. +(decl xmm_rmr_vex3 (AvxOpcode Xmm Xmm XmmMem) Xmm) +(rule (xmm_rmr_vex3 op src1 src2 src3) + (let ((dst WritableXmm (temp_writable_xmm)) + (_ Unit (emit (MInst.XmmRmRVex3 op src1 src2 src3 dst)))) dst)) ;; Helper for creating `vfmadd213ss` instructions. @@ -3117,28 +3171,28 @@ ; but we don't support VEX memory encodings yet (decl x64_vfmadd213ss (Xmm Xmm Xmm) Xmm) (rule (x64_vfmadd213ss x y z) - (xmm_rmr_vex (AvxOpcode.Vfmadd213ss) x y z)) + (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ss) x y z)) ;; Helper for creating `vfmadd213sd` instructions. ; TODO: This should have the (Xmm Xmm XmmMem) signature ; but we don't support VEX memory encodings yet (decl x64_vfmadd213sd (Xmm Xmm Xmm) Xmm) (rule (x64_vfmadd213sd x y z) - (xmm_rmr_vex (AvxOpcode.Vfmadd213sd) x y z)) + (xmm_rmr_vex3 (AvxOpcode.Vfmadd213sd) x y z)) ;; Helper for creating `vfmadd213ps` instructions. 
 ; TODO: This should have the (Xmm Xmm XmmMem) signature
 ; but we don't support VEX memory encodings yet
 (decl x64_vfmadd213ps (Xmm Xmm Xmm) Xmm)
 (rule (x64_vfmadd213ps x y z)
-  (xmm_rmr_vex (AvxOpcode.Vfmadd213ps) x y z))
+  (xmm_rmr_vex3 (AvxOpcode.Vfmadd213ps) x y z))
 
 ;; Helper for creating `vfmadd213pd` instructions.
 ; TODO: This should have the (Xmm Xmm XmmMem) signature
 ; but we don't support VEX memory encodings yet
 (decl x64_vfmadd213pd (Xmm Xmm Xmm) Xmm)
 (rule (x64_vfmadd213pd x y z)
-  (xmm_rmr_vex (AvxOpcode.Vfmadd213pd) x y z))
+  (xmm_rmr_vex3 (AvxOpcode.Vfmadd213pd) x y z))
 
 
 ;; Helper for creating `sqrtss` instructions.
@@ -3836,6 +3890,7 @@
 (convert RegMemImm XmmMemImm mov_rmi_to_xmm)
 (convert Xmm XmmMem xmm_to_xmm_mem)
 (convert Xmm XmmMemImm xmm_to_xmm_mem_imm)
+(convert XmmMem XmmMemImm xmm_mem_to_xmm_mem_imm)
 (convert XmmMem RegMem xmm_mem_to_reg_mem)
 (convert WritableXmm Xmm writable_xmm_to_xmm)
 (convert WritableXmm WritableReg writable_xmm_to_reg)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 76787bfb60..c4de650e4c 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -165,6 +165,12 @@ macro_rules! newtype_of_reg {
             }
         }
 
+        impl From<$newtype_reg_mem> for $newtype_reg_mem_imm {
+            fn from(r: $newtype_reg_mem) -> Self {
+                $newtype_reg_mem_imm(r.0.into())
+            }
+        }
+
         impl $newtype_reg_mem_imm {
             /// Construct this newtype from the given `RegMemImm`, or return
             /// `None` if the `RegMemImm` is not a valid instance of this
@@ -631,6 +637,15 @@ impl RegMemImm {
     }
 }
 
+impl From<RegMem> for RegMemImm {
+    fn from(rm: RegMem) -> RegMemImm {
+        match rm {
+            RegMem::Reg { reg } => RegMemImm::Reg { reg },
+            RegMem::Mem { addr } => RegMemImm::Mem { addr },
+        }
+    }
+}
+
 impl PrettyPrint for RegMemImm {
     fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String {
         match self {
@@ -730,6 +745,12 @@ impl RegMem {
     }
 }
 
+impl From<Reg> for RegMem {
+    fn from(reg: Reg) -> RegMem {
+        RegMem::Reg { reg }
+    }
+}
+
 impl From<Writable<Reg>> for RegMem {
     fn from(r: Writable<Reg>) -> Self {
         RegMem::reg(r.to_reg())
@@ -884,6 +905,7 @@ pub(crate) enum InstructionSet {
     #[allow(dead_code)] // never constructed (yet).
     BMI2,
     FMA,
+    AVX,
     AVX512BITALG,
     AVX512DQ,
     AVX512F,
@@ -1477,14 +1499,7 @@ impl fmt::Display for SseOpcode {
     }
 }
 
-#[derive(Clone, PartialEq)]
-#[allow(missing_docs)]
-pub enum AvxOpcode {
-    Vfmadd213ss,
-    Vfmadd213sd,
-    Vfmadd213ps,
-    Vfmadd213pd,
-}
+pub use crate::isa::x64::lower::isle::generated_code::AvxOpcode;
 
 impl AvxOpcode {
     /// Which `InstructionSet`s support the opcode?
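For context on the `Display` change in the hunks that follow: `AvxOpcode` is now the ISLE-generated enum, whose derived `Debug` prints CamelCase variant names, so lowercasing the `Debug` output recovers the assembler mnemonic. A standalone sketch of that trick (the one-variant enum here is illustrative, not the generated type):

    use std::fmt;

    #[derive(Debug)]
    enum AvxOpcode {
        Vandnps,
    }

    impl fmt::Display for AvxOpcode {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            // Debug prints "Vandnps"; lowercasing yields "vandnps".
            write!(f, "{}", format!("{self:?}").to_lowercase())
        }
    }

    fn main() {
        assert_eq!(AvxOpcode::Vandnps.to_string(), "vandnps");
    }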
@@ -1494,25 +1509,20 @@ impl AvxOpcode { | AvxOpcode::Vfmadd213sd | AvxOpcode::Vfmadd213ps | AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA], + AvxOpcode::Vminps + | AvxOpcode::Vorps + | AvxOpcode::Vandnps + | AvxOpcode::Vcmpps + | AvxOpcode::Vpsrld => { + smallvec![InstructionSet::AVX] + } } } } -impl fmt::Debug for AvxOpcode { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - let name = match self { - AvxOpcode::Vfmadd213ss => "vfmadd213ss", - AvxOpcode::Vfmadd213sd => "vfmadd213sd", - AvxOpcode::Vfmadd213ps => "vfmadd213ps", - AvxOpcode::Vfmadd213pd => "vfmadd213pd", - }; - write!(fmt, "{}", name) - } -} - impl fmt::Display for AvxOpcode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Debug::fmt(self, f) + format!("{self:?}").to_lowercase().fmt(f) } } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 830565eff6..9d984edab5 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -8,7 +8,7 @@ use crate::isa::x64::encoding::rex::{ low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap, RexFlags, }; -use crate::isa::x64::encoding::vex::{VexInstruction, VexVectorLength}; +use crate::isa::x64::encoding::vex::{RegisterOrAmode, VexInstruction, VexVectorLength}; use crate::isa::x64::inst::args::*; use crate::isa::x64::inst::*; use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel, Reg, Writable}; @@ -121,6 +121,7 @@ pub(crate) fn emit( InstructionSet::BMI1 => info.isa_flags.use_bmi1(), InstructionSet::BMI2 => info.isa_flags.has_bmi2(), InstructionSet::FMA => info.isa_flags.has_fma(), + InstructionSet::AVX => info.isa_flags.has_avx(), InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(), InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(), InstructionSet::AVX512F => info.isa_flags.has_avx512f(), @@ -1991,7 +1992,94 @@ pub(crate) fn emit( } } - Inst::XmmRmRVex { + Inst::XmmRmiRVex { + op, + src1, + src2, + dst, + } => { + let dst = allocs.next(dst.to_reg().to_reg()); + let src1 = allocs.next(src1.to_reg()); + let src2 = src2.clone().to_reg_mem_imm().with_allocs(allocs); + + let src2 = match src2 { + // For opcodes where one of the operands is an immediate the + // encoding is a bit different, notably the usage of + // `opcode_ext`, so handle that specially here. 
+                RegMemImm::Imm { simm32 } => {
+                    let (opcode, opcode_ext, prefix) = match op {
+                        AvxOpcode::Vpsrld => (0x72, 2, LegacyPrefixes::_66),
+                        _ => panic!("unexpected avx opcode with immediate {op:?}"),
+                    };
+                    VexInstruction::new()
+                        .length(VexVectorLength::V128)
+                        .prefix(prefix)
+                        .map(OpcodeMap::_0F)
+                        .opcode(opcode)
+                        .opcode_ext(opcode_ext)
+                        .vvvv(dst.to_real_reg().unwrap().hw_enc())
+                        .rm(src1.to_real_reg().unwrap().hw_enc())
+                        .imm(simm32.try_into().unwrap())
+                        .encode(sink);
+                    return;
+                }
+                RegMemImm::Reg { reg } => {
+                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
+                }
+                RegMemImm::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
+            };
+            let (prefix, opcode) = match op {
+                AvxOpcode::Vminps => (LegacyPrefixes::None, 0x5D),
+                AvxOpcode::Vandnps => (LegacyPrefixes::None, 0x55),
+                AvxOpcode::Vorps => (LegacyPrefixes::None, 0x56),
+                AvxOpcode::Vpsrld => (LegacyPrefixes::_66, 0xD2),
+                _ => panic!("unexpected rmir vex opcode {op:?}"),
+            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .prefix(prefix)
+                .opcode(opcode)
+                .map(OpcodeMap::_0F)
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .vvvv(src1.to_real_reg().unwrap().hw_enc())
+                .rm(src2)
+                .encode(sink);
+        }
+
+        Inst::XmmRmRImmVex {
+            op,
+            src1,
+            src2,
+            dst,
+            imm,
+        } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            let src1 = allocs.next(src1.to_reg());
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
+
+            let (w, opcode) = match op {
+                AvxOpcode::Vcmpps => (false, 0xC2),
+                _ => unreachable!(),
+            };
+
+            match src2 {
+                RegMem::Reg { reg: src } => VexInstruction::new()
+                    .length(VexVectorLength::V128)
+                    .prefix(LegacyPrefixes::None)
+                    .map(OpcodeMap::_0F)
+                    .w(w)
+                    .opcode(opcode)
+                    .reg(dst.to_real_reg().unwrap().hw_enc())
+                    .rm(src.to_real_reg().unwrap().hw_enc())
+                    .vvvv(src1.to_real_reg().unwrap().hw_enc())
+                    .imm(*imm)
+                    .encode(sink),
+                _ => todo!(),
+            };
+        }
+
+        Inst::XmmRmRVex3 {
             op,
             src1,
             src2,
@@ -2009,6 +2097,7 @@ pub(crate) fn emit(
                 AvxOpcode::Vfmadd213sd => (true, 0xA9),
                 AvxOpcode::Vfmadd213ps => (false, 0xA8),
                 AvxOpcode::Vfmadd213pd => (true, 0xA8),
+                _ => unreachable!(),
             };
 
             match src3 {
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 3b9250329a..f63eb3e8e3 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3964,25 +3964,25 @@ fn test_x64_emit() {
 
     // XMM FMA
     insns.push((
-        Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ss, RegMem::reg(xmm2), xmm1, w_xmm0),
+        Inst::xmm_rmr_vex3(AvxOpcode::Vfmadd213ss, RegMem::reg(xmm2), xmm1, w_xmm0),
         "C4E271A9C2",
         "vfmadd213ss %xmm0, %xmm1, %xmm2, %xmm0",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213sd, RegMem::reg(xmm5), xmm4, w_xmm3),
+        Inst::xmm_rmr_vex3(AvxOpcode::Vfmadd213sd, RegMem::reg(xmm5), xmm4, w_xmm3),
         "C4E2D9A9DD",
         "vfmadd213sd %xmm3, %xmm4, %xmm5, %xmm3",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ps, RegMem::reg(xmm2), xmm1, w_xmm0),
+        Inst::xmm_rmr_vex3(AvxOpcode::Vfmadd213ps, RegMem::reg(xmm2), xmm1, w_xmm0),
         "C4E271A8C2",
         "vfmadd213ps %xmm0, %xmm1, %xmm2, %xmm0",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213pd, RegMem::reg(xmm5), xmm4, w_xmm3),
+        Inst::xmm_rmr_vex3(AvxOpcode::Vfmadd213pd, RegMem::reg(xmm5), xmm4, w_xmm3),
         "C4E2D9A8DD",
         "vfmadd213pd %xmm3, %xmm4, %xmm5, %xmm3",
     ));
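The immediate branch above warrants a worked example: `vpsrld` with an immediate is `VEX.128.66.0F 72 /2 ib`, so the ModRM reg field carries the `/2` opcode extension, the rm field carries the source register, and the destination travels in `vvvv`. That is why the branch calls `.opcode_ext(...)` and routes `dst` through `.vvvv(...)` instead of `.reg(...)`. A hand-computed sketch (register choices arbitrary; the same shape appears in the `f32x4_abs` filetest below as `vpsrld $1, %xmm2, %xmm4`):

    fn main() {
        let opcode_ext = 2u8; // the "/2" of "72 /2 ib"
        let src = 2u8; // xmm2 goes in ModRM.rm
        let dst = 4u8; // xmm4 goes in VEX.vvvv
        // ModRM with mod=11 (register direct), reg=/2, rm=src.
        let modrm = (0b11 << 6) | ((opcode_ext & 7) << 3) | (src & 7);
        assert_eq!(modrm, 0xd2);
        // Two-byte VEX byte: R=1, inverted vvvv, L=0 (128-bit), pp=01 (0x66).
        let vex = (1u8 << 7) | ((!dst & 0xf) << 3) | 0b01;
        assert_eq!(vex, 0xd9);
        // Full encoding of `vpsrld $1, %xmm2, %xmm4`.
        assert_eq!([0xc5, vex, 0x72, modrm, 0x01], [0xc5, 0xd9, 0x72, 0xd2, 0x01]);
    }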
index 6716ec21bf..a1056b859d 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -142,7 +142,9 @@ impl Inst {
             | Inst::XmmRmREvex { op, .. }
             | Inst::XmmRmREvex3 { op, .. } => op.available_from(),
 
-            Inst::XmmRmRVex { op, .. } => op.available_from(),
+            Inst::XmmRmiRVex { op, .. }
+            | Inst::XmmRmRVex3 { op, .. }
+            | Inst::XmmRmRImmVex { op, .. } => op.available_from(),
         }
     }
 }
@@ -303,11 +305,11 @@ impl Inst {
     }
 
     #[cfg(test)]
-    pub(crate) fn xmm_rm_r_vex(op: AvxOpcode, src3: RegMem, src2: Reg, dst: Writable<Reg>) -> Self {
+    pub(crate) fn xmm_rmr_vex3(op: AvxOpcode, src3: RegMem, src2: Reg, dst: Writable<Reg>) -> Self {
         src3.assert_regclass_is(RegClass::Float);
         debug_assert!(src2.class() == RegClass::Float);
         debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmRmRVex {
+        Inst::XmmRmRVex3 {
             op,
             src3: XmmMem::new(src3).unwrap(),
             src2: Xmm::new(src2).unwrap(),
@@ -988,7 +990,42 @@ impl PrettyPrint for Inst {
                 )
             }
 
-            Inst::XmmRmRVex {
+            Inst::XmmRmiRVex {
+                op,
+                src1,
+                src2,
+                dst,
+                ..
+            } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(8, allocs);
+
+                format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
+            }
+
+            Inst::XmmRmRImmVex {
+                op,
+                src1,
+                src2,
+                dst,
+                imm,
+                ..
+            } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(8, allocs);
+
+                format!(
+                    "{} ${imm} {}, {}, {}",
+                    ljustify(op.to_string()),
+                    src1,
+                    src2,
+                    dst
+                )
+            }
+
+            Inst::XmmRmRVex3 {
                 op,
                 src1,
                 src2,
@@ -1892,7 +1929,21 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCollector<'_, F>) {
             collector.reg_reuse_def(dst.to_writable_reg(), 0);
             src2.get_operands(collector);
         }
-        Inst::XmmRmRVex {
+        Inst::XmmRmiRVex {
+            src1, src2, dst, ..
+        } => {
+            collector.reg_def(dst.to_writable_reg());
+            collector.reg_use(src1.to_reg());
+            src2.get_operands(collector);
+        }
+        Inst::XmmRmRImmVex {
+            src1, src2, dst, ..
+ } => { + collector.reg_def(dst.to_writable_reg()); + collector.reg_use(src1.to_reg()); + src2.get_operands(collector); + } + Inst::XmmRmRVex3 { op, src1, src2, diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 26766dc3d6..8adbf94465 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -185,6 +185,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { imm.encode() } + #[inline] + fn has_avx(&mut self) -> bool { + self.backend.x64_flags.has_avx() + } + #[inline] fn avx512vl_enabled(&mut self, _: Type) -> bool { self.backend.x64_flags.use_avx512vl_simd() @@ -439,6 +444,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { r.into() } + #[inline] + fn xmm_mem_to_xmm_mem_imm(&mut self, r: &XmmMem) -> XmmMemImm { + r.clone().into() + } + #[inline] fn temp_writable_gpr(&mut self) -> WritableGpr { Writable::from_reg(Gpr::new(self.temp_writable_reg(I64).to_reg()).unwrap()) diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif index ea1f1f913c..ce4e8d0e87 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif @@ -1,6 +1,6 @@ test compile precise-output set enable_simd -target x86_64 skylake +target x86_64 function %mask_from_icmp(i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16): diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif new file mode 100644 index 0000000000..eeb096b3e4 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-avx.clif @@ -0,0 +1,228 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %mask_from_icmp(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fmin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vminps %xmm0, %xmm1, %xmm3 +; vminps %xmm1, %xmm0, %xmm5 +; vorps %xmm3, %xmm5, %xmm7 +; vcmpps $3 %xmm7, %xmm5, %xmm9 +; vorps %xmm7, %xmm9, %xmm11 +; vpsrld %xmm9, $10, %xmm13 +; vandnps %xmm13, %xmm11, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vminps %xmm1, %xmm0, %xmm3 +; vminps %xmm0, %xmm1, %xmm5 +; vorps %xmm5, %xmm3, %xmm7 +; vcmpunordps %xmm5, %xmm7, %xmm9 +; vorps %xmm9, %xmm7, %xmm11 +; vpsrld $0xa, %xmm9, %xmm13 +; vandnps %xmm11, %xmm13, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %or_from_memory(f32x4, i64) -> f32x4 { +block0(v0: f32x4, v1: i64): + v2 = load.f32x4 notrap aligned v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movups 0(%rdi), %xmm4 +; vorps %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movups (%rdi), %xmm4 +; vorps %xmm4, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %copysign_from_memory(i64) -> f32 { +block0(v0: i64): + v1 = f32const 0.0 + v2 = load.f32 notrap aligned v0 + v3 = fcopysign v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movss 0(%rdi), %xmm7 +; movl $-2147483648, %ecx +; movd %ecx, %xmm8 +; vandnps %xmm8, const(0), %xmm9 +; andps %xmm8, %xmm7, %xmm8 +; vorps %xmm9, %xmm8, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; 
block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movss (%rdi), %xmm7 +; movl $0x80000000, %ecx +; movd %ecx, %xmm8 +; vandnps 0x16(%rip), %xmm8, %xmm9 +; andps %xmm7, %xmm8 +; vorps %xmm8, %xmm9, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) +; addb %al, (%rax) + +function %bor_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = bor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vorps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vorps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %band_not_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = band_not v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vandnps %xmm1, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vandnps %xmm0, %xmm1, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %i32x4_shr(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rcx +; andq %rcx, $31, %rcx +; movd %ecx, %xmm5 +; vpsrld %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rdi, %rcx +; andq $0x1f, %rcx +; movd %ecx, %xmm5 +; vpsrld %xmm5, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f32x4_abs(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fabs v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pcmpeqd %xmm2, %xmm2, %xmm2 +; vpsrld %xmm2, $1, %xmm4 +; andps %xmm0, %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; pcmpeqd %xmm2, %xmm2 +; vpsrld $1, %xmm2, %xmm4 +; andps %xmm4, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif index 2c56dfd3c8..24950d7732 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif @@ -1,6 +1,6 @@ test compile precise-output set enable_simd -target x86_64 skylake +target x86_64 function %band_f32x4(f32x4, f32x4) -> f32x4 { block0(v0: f32x4, v1: f32x4): diff --git a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif index 4bc1ac828a..ec1e4ad018 100644 --- a/cranelift/filetests/filetests/runtests/simd-arithmetic.clif +++ b/cranelift/filetests/filetests/runtests/simd-arithmetic.clif @@ -4,6 +4,7 @@ test run target aarch64 target s390x set enable_simd +target x86_64 target x86_64 skylake function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
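As a cross-check of the new VEX memory encodings, the expected bytes of the `vandnps_mem` unit test earlier in this patch (`VANDNPS 10(%r13), %xmm1, %xmm2` => `c4 c1 70 55 55 0a`) can be derived by hand from the rules above; a sketch, not project code:

    fn main() {
        // Byte 1 of the three-byte VEX prefix packs R, X, B (all inverted)
        // and mmmmm = 00001 for the 0F opcode map. The base r13 (enc 13) has
        // bit 3 set, so B = 0, which forces the three-byte form.
        let (r, x, b) = (1u8, 1u8, 0u8); // xmm2 -> R=1; no index -> X=1; r13 -> B=0
        assert_eq!((r << 7) | (x << 6) | (b << 5) | 0b00001, 0xc1);
        // Byte 2 packs W=0, inverted vvvv (xmm1, enc 1), L=0, pp=00.
        assert_eq!((!1u8 & 0xf) << 3, 0x70);
        // Then the 0x55 opcode, ModRM mod=01/reg=xmm2/rm=101, and disp8 = 10.
        assert_eq!((0b01u8 << 6) | (2 << 3) | 5, 0x55);
        assert_eq!(10u8, 0x0a);
    }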