diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index 729d21d121..fe8660bbaf 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -3,7 +3,6 @@ // Some variants are never constructed, but we still want them as options in the future. #![allow(dead_code)] -use crate::ir; use crate::ir::types::{F32X2, F32X4, F64X2, I16X4, I16X8, I32X2, I32X4, I64X2, I8X16, I8X8}; use crate::ir::Type; use crate::isa::aarch64::inst::*; @@ -681,30 +680,3 @@ impl VectorSize { } } } - -//============================================================================= -// Instruction sub-components: atomic memory update operations - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -#[repr(u8)] -pub enum AtomicRMWOp { - Add, - Sub, - And, - Or, - Xor, - Xchg, -} - -impl AtomicRMWOp { - pub fn from(ir_op: ir::AtomicRmwOp) -> Self { - match ir_op { - ir::AtomicRmwOp::Add => AtomicRMWOp::Add, - ir::AtomicRmwOp::Sub => AtomicRMWOp::Sub, - ir::AtomicRmwOp::And => AtomicRMWOp::And, - ir::AtomicRmwOp::Or => AtomicRMWOp::Or, - ir::AtomicRmwOp::Xor => AtomicRMWOp::Xor, - ir::AtomicRmwOp::Xchg => AtomicRMWOp::Xchg, - } - } -} diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 32fe3aa6cf..60a81eb005 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1090,18 +1090,18 @@ impl MachInstEmit for Inst { } sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25] - if op == AtomicRMWOp::Xchg { + if op == inst_common::AtomicRmwOp::Xchg { // mov x28, x26 sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26)) } else { // add/sub/and/orr/eor x28, x27, x26 let bits_31_21 = match op { - AtomicRMWOp::Add => 0b100_01011_00_0, - AtomicRMWOp::Sub => 0b110_01011_00_0, - AtomicRMWOp::And => 0b100_01010_00_0, - AtomicRMWOp::Or => 0b101_01010_00_0, - AtomicRMWOp::Xor 
=> 0b110_01010_00_0, - AtomicRMWOp::Xchg => unreachable!(), + inst_common::AtomicRmwOp::Add => 0b100_01011_00_0, + inst_common::AtomicRmwOp::Sub => 0b110_01011_00_0, + inst_common::AtomicRmwOp::And => 0b100_01010_00_0, + inst_common::AtomicRmwOp::Or => 0b101_01010_00_0, + inst_common::AtomicRmwOp::Xor => 0b110_01010_00_0, + inst_common::AtomicRmwOp::Xchg => unreachable!(), }; sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26)); } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index e2f08abb21..f8b446de31 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -4551,7 +4551,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::AtomicRMW { ty: I16, - op: AtomicRMWOp::Xor, + op: inst_common::AtomicRmwOp::Xor, srcloc: None, }, "BF3B03D53B7F5F487C031ACA3C7F1848B8FFFFB5BF3B03D5", @@ -4561,7 +4561,7 @@ fn test_aarch64_binemit() { insns.push(( Inst::AtomicRMW { ty: I32, - op: AtomicRMWOp::Xchg, + op: inst_common::AtomicRmwOp::Xchg, srcloc: None, }, "BF3B03D53B7F5F88FC031AAA3C7F1888B8FFFFB5BF3B03D5", diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index b90dccd41a..b527b7dc19 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -649,7 +649,7 @@ pub enum Inst { /// x28 (wr) scratch reg; value afterwards has no meaning AtomicRMW { ty: Type, // I8, I16, I32 or I64 - op: AtomicRMWOp, + op: inst_common::AtomicRmwOp, srcloc: Option, }, diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index d399b90ed0..55b675a714 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -7,10 +7,11 @@ //! //! - Floating-point immediates (FIMM instruction). 
+use crate::ir; use crate::ir::condcodes::{FloatCC, IntCC}; use crate::ir::types::*; use crate::ir::Inst as IRInst; -use crate::ir::{AtomicRmwOp, InstructionData, Opcode, TrapCode, Type}; +use crate::ir::{InstructionData, Opcode, TrapCode, Type}; use crate::machinst::lower::*; use crate::machinst::*; use crate::CodegenResult; @@ -1067,7 +1068,7 @@ pub(crate) fn inst_trapcode(data: &InstructionData) -> Option { } } -pub(crate) fn inst_atomic_rmw_op(data: &InstructionData) -> Option { +pub(crate) fn inst_atomic_rmw_op(data: &InstructionData) -> Option { match data { &InstructionData::AtomicRmw { op, .. } => Some(op), _ => None, diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index b2915d024e..b52f01364d 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -21,7 +21,8 @@ use smallvec::SmallVec; use super::lower::*; -fn is_single_word_int_ty(ty: Type) -> bool { +/// This is target-word-size dependent. And it excludes booleans and reftypes. 
+fn is_valid_atomic_transaction_ty(ty: Type) -> bool { match ty { I8 | I16 | I32 | I64 => true, _ => false, @@ -1228,7 +1229,7 @@ pub(crate) fn lower_insn_to_regs>( let mut r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let mut r_arg2 = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty_access = ty.unwrap(); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) @@ -1244,7 +1245,7 @@ pub(crate) fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(Writable::from_reg(xreg(25)), r_addr, I64)); ctx.emit(Inst::gen_move(Writable::from_reg(xreg(26)), r_arg2, I64)); // Now the AtomicRMW insn itself - let op = AtomicRMWOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap()); + let op = inst_common::AtomicRmwOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap()); ctx.emit(Inst::AtomicRMW { ty: ty_access, op, @@ -1264,7 +1265,7 @@ pub(crate) fn lower_insn_to_regs>( let mut r_expected = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let mut r_replacement = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None); let ty_access = ty.unwrap(); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) @@ -1302,7 +1303,7 @@ pub(crate) fn lower_insn_to_regs>( let r_data = get_output_reg(ctx, outputs[0]); let r_addr = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); let ty_access = ty.unwrap(); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) @@ -1321,7 +1322,7 @@ pub(crate) fn lower_insn_to_regs>( let r_data = put_input_in_reg(ctx, inputs[0], 
NarrowValueMode::None); let r_addr = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); let ty_access = ctx.input_ty(insn, 0); - assert!(is_single_word_int_ty(ty_access)); + assert!(is_valid_atomic_transaction_ty(ty_access)); let memflags = ctx.memflags(insn).expect("memory flags"); let srcloc = if !memflags.notrap() { Some(ctx.srcloc(insn)) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 343f3322d0..8690c57a4c 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1010,3 +1010,14 @@ impl OperandSize { } } } + +/// An x64 memory fence kind. +#[derive(Clone)] +pub enum FenceKind { + /// `mfence` instruction ("Memory Fence") + MFence, + /// `lfence` instruction ("Load Fence") + LFence, + /// `sfence` instruction ("Store Fence") + SFence, +} diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 9bae562c5c..b54de499c9 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -3,7 +3,7 @@ use crate::ir::immediates::{Ieee32, Ieee64}; use crate::ir::TrapCode; use crate::isa::x64::inst::args::*; use crate::isa::x64::inst::*; -use crate::machinst::{MachBuffer, MachInstEmit, MachLabel}; +use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel}; use core::convert::TryInto; use log::debug; use regalloc::{Reg, RegClass, Writable}; @@ -118,25 +118,38 @@ impl RexFlags { } } -/// For specifying the legacy prefixes (or `None` if no prefix required) to -/// be used at the start an instruction. A given prefix may be required for -/// various operations, including instructions that operate on GPR, SSE, and Vex -/// registers. -enum LegacyPrefix { +/// We may need to include one or more legacy prefix bytes before the REX prefix. This enum +/// covers only the small set of possibilities that we actually need. 
+enum LegacyPrefixes { + /// No prefix bytes None, + /// Operand Size Override -- here, denoting "16-bit operation" _66, + /// The Lock prefix + _F0, + /// Operand size override and Lock + _66F0, + /// REPNE, but no specific meaning here -- is just an opcode extension _F2, + /// REP/REPE, but no specific meaning here -- is just an opcode extension _F3, } -impl LegacyPrefix { +impl LegacyPrefixes { #[inline(always)] fn emit(&self, sink: &mut MachBuffer) { match self { - LegacyPrefix::_66 => sink.put1(0x66), - LegacyPrefix::_F2 => sink.put1(0xF2), - LegacyPrefix::_F3 => sink.put1(0xF3), - LegacyPrefix::None => (), + LegacyPrefixes::_66 => sink.put1(0x66), + LegacyPrefixes::_F0 => sink.put1(0xF0), + LegacyPrefixes::_66F0 => { + // I don't think the order matters, but in any case, this is the same order that + // the GNU assembler uses. + sink.put1(0x66); + sink.put1(0xF0); + } + LegacyPrefixes::_F2 => sink.put1(0xF2), + LegacyPrefixes::_F3 => sink.put1(0xF3), + LegacyPrefixes::None => (), } } } @@ -145,15 +158,16 @@ impl LegacyPrefix { /// /// For an instruction that has as operands a reg encoding `enc_g` and a memory address `mem_e`, /// create and emit: -/// - first the REX prefix, +/// - first the legacy prefixes, if any +/// - then the REX prefix, if needed /// - then caller-supplied opcode byte(s) (`opcodes` and `num_opcodes`), /// - then the MOD/RM byte, /// - then optionally, a SIB byte, /// - and finally optionally an immediate that will be derived from the `mem_e` operand. /// /// For most instructions up to and including SSE4.2, that will be the whole instruction: this is -/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX instructions -/// will require their own emitter functions. +/// what we call "standard" instructions, and abbreviate "std" in the name here. VEX-prefixed +/// instructions will require their own emitter functions. 
/// /// This will also work for 32-bits x86 instructions, assuming no REX prefix is provided. /// @@ -168,7 +182,7 @@ impl LegacyPrefix { /// indicate a 64-bit operation. fn emit_std_enc_mem( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, mut num_opcodes: usize, enc_g: u8, @@ -179,7 +193,7 @@ fn emit_std_enc_mem( // 64-bit integer registers, because they are part of an address // expression. But `enc_g` can be derived from a register of any class. - prefix.emit(sink); + prefixes.emit(sink); match mem_e { Amode::ImmReg { simm32, base } => { @@ -304,7 +318,7 @@ fn emit_std_enc_mem( /// operand is a register rather than memory. Hence it is much simpler. fn emit_std_enc_enc( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, mut num_opcodes: usize, enc_g: u8, @@ -316,8 +330,8 @@ fn emit_std_enc_enc( // integer-to-FP conversion insn, one might be RegClass::I64 and the other // RegClass::V128. - // The operand-size override. - prefix.emit(sink); + // The legacy prefixes. + prefixes.emit(sink); // The rex byte. 
rex.emit_two_op(sink, enc_g, enc_e); @@ -338,7 +352,7 @@ fn emit_std_enc_enc( fn emit_std_reg_mem( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, num_opcodes: usize, reg_g: Reg, @@ -346,12 +360,12 @@ fn emit_std_reg_mem( rex: RexFlags, ) { let enc_g = reg_enc(reg_g); - emit_std_enc_mem(sink, prefix, opcodes, num_opcodes, enc_g, mem_e, rex); + emit_std_enc_mem(sink, prefixes, opcodes, num_opcodes, enc_g, mem_e, rex); } fn emit_std_reg_reg( sink: &mut MachBuffer, - prefix: LegacyPrefix, + prefixes: LegacyPrefixes, opcodes: u32, num_opcodes: usize, reg_g: Reg, @@ -360,7 +374,7 @@ fn emit_std_reg_reg( ) { let enc_g = reg_enc(reg_g); let enc_e = reg_enc(reg_e); - emit_std_enc_enc(sink, prefix, opcodes, num_opcodes, enc_g, enc_e, rex); + emit_std_enc_enc(sink, prefixes, opcodes, num_opcodes, enc_g, enc_e, rex); } /// Write a suitable number of bits from an imm64 to the sink. @@ -481,7 +495,7 @@ pub(crate) fn emit( RegMemImm::Reg { reg: reg_e } => { emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x0FAF, 2, reg_g.to_reg(), @@ -493,7 +507,7 @@ pub(crate) fn emit( RegMemImm::Mem { addr } => { emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x0FAF, 2, reg_g.to_reg(), @@ -508,7 +522,7 @@ pub(crate) fn emit( // Yes, really, reg_g twice. emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode, 1, reg_g.to_reg(), @@ -535,7 +549,7 @@ pub(crate) fn emit( // code easily. emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode_r, 1, *reg_e, @@ -550,7 +564,7 @@ pub(crate) fn emit( // Here we revert to the "normal" G-E ordering. 
emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode_m, 1, reg_g.to_reg(), @@ -566,7 +580,7 @@ pub(crate) fn emit( let enc_g = int_reg_enc(reg_g.to_reg()); emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode, 1, subopcode_i, @@ -581,9 +595,9 @@ pub(crate) fn emit( Inst::UnaryRmR { size, op, src, dst } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!(), }; @@ -621,9 +635,9 @@ pub(crate) fn emit( loc, } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!(), }; @@ -649,9 +663,9 @@ pub(crate) fn emit( Inst::MulHi { size, signed, rhs } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!(), }; @@ -826,7 +840,7 @@ pub(crate) fn emit( } else { RexFlags::clear_w() }; - emit_std_reg_reg(sink, LegacyPrefix::None, 0x89, 1, *src, dst.to_reg(), rex); + emit_std_reg_reg(sink, LegacyPrefixes::None, 0x89, 1, *src, dst.to_reg(), rex); } Inst::MovZX_RM_R { @@ -880,7 +894,7 @@ pub(crate) fn emit( } emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -899,7 +913,7 @@ 
pub(crate) fn emit( emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -920,7 +934,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x8B, 1, dst.to_reg(), @@ -931,7 +945,7 @@ pub(crate) fn emit( Inst::LoadEffectiveAddress { addr, dst } => emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x8D, 1, dst.to_reg(), @@ -982,7 +996,7 @@ pub(crate) fn emit( } emit_std_reg_reg( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -1001,7 +1015,7 @@ pub(crate) fn emit( emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcodes, num_opcodes, dst.to_reg(), @@ -1038,14 +1052,14 @@ pub(crate) fn emit( }; // MOV r8, r/m8 is (REX.W==0) 88 /r - emit_std_reg_mem(sink, LegacyPrefix::None, 0x88, 1, *src, dst, rex) + emit_std_reg_mem(sink, LegacyPrefixes::None, 0x88, 1, *src, dst, rex) } 2 => { // MOV r16, r/m16 is 66 (REX.W==0) 89 /r emit_std_reg_mem( sink, - LegacyPrefix::_66, + LegacyPrefixes::_66, 0x89, 1, *src, @@ -1058,7 +1072,7 @@ pub(crate) fn emit( // MOV r32, r/m32 is (REX.W==0) 89 /r emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x89, 1, *src, @@ -1071,7 +1085,7 @@ pub(crate) fn emit( // MOV r64, r/m64 is (REX.W==1) 89 /r emit_std_reg_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0x89, 1, *src, @@ -1109,7 +1123,7 @@ pub(crate) fn emit( None => { // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode - emit_std_enc_enc(sink, LegacyPrefix::None, 0xD3, 1, subopcode, enc_dst, rex); + emit_std_enc_enc(sink, LegacyPrefixes::None, 0xD3, 1, subopcode, enc_dst, rex); } Some(num_bits) => { @@ -1117,7 +1131,7 @@ pub(crate) fn emit( // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib // When the shift amount is 1, there's an even shorter encoding, but we don't // bother with that nicety here. 
- emit_std_enc_enc(sink, LegacyPrefix::None, 0xC1, 1, subopcode, enc_dst, rex); + emit_std_enc_enc(sink, LegacyPrefixes::None, 0xC1, 1, subopcode, enc_dst, rex); sink.put1(*num_bits); } } @@ -1125,7 +1139,7 @@ pub(crate) fn emit( Inst::XmmRmiReg { opcode, src, dst } => { let rex = RexFlags::clear_w(); - let prefix = LegacyPrefix::_66; + let prefix = LegacyPrefixes::_66; if let RegMemImm::Imm { simm32 } = src { let (opcode_bytes, reg_digit) = match opcode { SseOpcode::Psllw => (0x0F71, 6), @@ -1175,9 +1189,9 @@ pub(crate) fn emit( src: src_e, dst: reg_g, } => { - let mut prefix = LegacyPrefix::None; + let mut prefix = LegacyPrefixes::None; if *size == 2 { - prefix = LegacyPrefix::_66; + prefix = LegacyPrefixes::_66; } let mut rex = match size { @@ -1245,7 +1259,7 @@ pub(crate) fn emit( rex_flags.always_emit(); emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, opcode, 2, 0, @@ -1261,9 +1275,9 @@ pub(crate) fn emit( dst: reg_g, } => { let (prefix, rex_flags) = match size { - 2 => (LegacyPrefix::_66, RexFlags::clear_w()), - 4 => (LegacyPrefix::None, RexFlags::clear_w()), - 8 => (LegacyPrefix::None, RexFlags::set_w()), + 2 => (LegacyPrefixes::_66, RexFlags::clear_w()), + 4 => (LegacyPrefixes::None, RexFlags::clear_w()), + 8 => (LegacyPrefixes::None, RexFlags::set_w()), _ => unreachable!("invalid size spec for cmove"), }; let opcode = 0x0F40 + cc.get_enc() as u32; @@ -1315,7 +1329,7 @@ pub(crate) fn emit( let addr = &addr.finalize(state); emit_std_enc_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 6, /*subopcode*/ @@ -1371,7 +1385,7 @@ pub(crate) fn emit( let reg_enc = int_reg_enc(*reg); emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 2, /*subopcode*/ @@ -1384,7 +1398,7 @@ pub(crate) fn emit( let addr = &addr.finalize(state); emit_std_enc_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 2, /*subopcode*/ @@ -1461,7 +1475,7 @@ pub(crate) fn emit( let reg_enc = int_reg_enc(*reg); 
emit_std_enc_enc( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 4, /*subopcode*/ @@ -1474,7 +1488,7 @@ pub(crate) fn emit( let addr = &addr.finalize(state); emit_std_enc_mem( sink, - LegacyPrefix::None, + LegacyPrefixes::None, 0xFF, 1, 4, /*subopcode*/ @@ -1596,20 +1610,20 @@ pub(crate) fn emit( let rex = RexFlags::clear_w(); let (prefix, opcode) = match op { - SseOpcode::Cvtss2sd => (LegacyPrefix::_F3, 0x0F5A), - SseOpcode::Cvtsd2ss => (LegacyPrefix::_F2, 0x0F5A), - SseOpcode::Movaps => (LegacyPrefix::None, 0x0F28), - SseOpcode::Movapd => (LegacyPrefix::_66, 0x0F28), - SseOpcode::Movdqa => (LegacyPrefix::_66, 0x0F6F), - SseOpcode::Movdqu => (LegacyPrefix::_F3, 0x0F6F), - SseOpcode::Movsd => (LegacyPrefix::_F2, 0x0F10), - SseOpcode::Movss => (LegacyPrefix::_F3, 0x0F10), - SseOpcode::Movups => (LegacyPrefix::None, 0x0F10), - SseOpcode::Movupd => (LegacyPrefix::_66, 0x0F10), - SseOpcode::Sqrtps => (LegacyPrefix::None, 0x0F51), - SseOpcode::Sqrtpd => (LegacyPrefix::_66, 0x0F51), - SseOpcode::Sqrtss => (LegacyPrefix::_F3, 0x0F51), - SseOpcode::Sqrtsd => (LegacyPrefix::_F2, 0x0F51), + SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A), + SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A), + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28), + SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10), + SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51), + SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51), + SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51), + SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51), _ => unimplemented!("Opcode {:?} not implemented", op), }; @@ -1635,49 +1649,49 @@ pub(crate) fn emit( } => { let rex = 
RexFlags::clear_w(); let (prefix, opcode, length) = match op { - SseOpcode::Addps => (LegacyPrefix::None, 0x0F58, 2), - SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58, 2), - SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58, 2), - SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58, 2), - SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54, 2), - SseOpcode::Andps => (LegacyPrefix::None, 0x0F54, 2), - SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55, 2), - SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55, 2), - SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E, 2), - SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E, 2), - SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E, 2), - SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E, 2), - SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D, 2), - SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D, 2), - SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D, 2), - SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D, 2), - SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F, 2), - SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F, 2), - SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F, 2), - SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F, 2), - SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59, 2), - SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59, 2), - SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59, 2), - SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59, 2), - SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56, 2), - SseOpcode::Orps => (LegacyPrefix::None, 0x0F56, 2), - SseOpcode::Paddb => (LegacyPrefix::_66, 0x0FFC, 2), - SseOpcode::Paddd => (LegacyPrefix::_66, 0x0FFE, 2), - SseOpcode::Paddq => (LegacyPrefix::_66, 0x0FD4, 2), - SseOpcode::Paddw => (LegacyPrefix::_66, 0x0FFD, 2), - SseOpcode::Pmulld => (LegacyPrefix::_66, 0x0F3840, 3), - SseOpcode::Pmullw => (LegacyPrefix::_66, 0x0FD5, 2), - SseOpcode::Pmuludq => (LegacyPrefix::_66, 0x0FF4, 2), - SseOpcode::Psubb => (LegacyPrefix::_66, 0x0FF8, 2), - SseOpcode::Psubd => (LegacyPrefix::_66, 0x0FFA, 2), - SseOpcode::Psubq => 
(LegacyPrefix::_66, 0x0FFB, 2), - SseOpcode::Psubw => (LegacyPrefix::_66, 0x0FF9, 2), - SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C, 2), - SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C, 2), - SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C, 2), - SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C, 2), - SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57, 2), - SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57, 2), + SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2), + SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2), + SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2), + SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2), + SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2), + SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2), + SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2), + SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2), + SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2), + SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2), + SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2), + SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2), + SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2), + SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2), + SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2), + SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2), + SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2), + SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2), + SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2), + SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2), + SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2), + SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2), + SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2), + SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2), + SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2), + SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2), + SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2), + SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2), + 
SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2), + SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2), + SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3), + SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2), + SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2), + SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2), + SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2), + SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2), + SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2), + SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2), + SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2), + SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2), + SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2), + SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2), + SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; @@ -1780,10 +1794,10 @@ pub(crate) fn emit( Inst::XmmRmRImm { op, src, dst, imm } => { let prefix = match op { - SseOpcode::Cmpps => LegacyPrefix::None, - SseOpcode::Cmppd => LegacyPrefix::_66, - SseOpcode::Cmpss => LegacyPrefix::_F3, - SseOpcode::Cmpsd => LegacyPrefix::_F2, + SseOpcode::Cmpps => LegacyPrefixes::None, + SseOpcode::Cmppd => LegacyPrefixes::_66, + SseOpcode::Cmpss => LegacyPrefixes::_F3, + SseOpcode::Cmpsd => LegacyPrefixes::_F2, _ => unimplemented!("Opcode {:?} not implemented", op), }; let opcode = 0x0FC2; @@ -1833,14 +1847,14 @@ pub(crate) fn emit( srcloc, } => { let (prefix, opcode) = match op { - SseOpcode::Movaps => (LegacyPrefix::None, 0x0F29), - SseOpcode::Movapd => (LegacyPrefix::_66, 0x0F29), - SseOpcode::Movdqa => (LegacyPrefix::_66, 0x0F7F), - SseOpcode::Movdqu => (LegacyPrefix::_F3, 0x0F7F), - SseOpcode::Movss => (LegacyPrefix::_F3, 0x0F11), - SseOpcode::Movsd => (LegacyPrefix::_F2, 0x0F11), - SseOpcode::Movups => (LegacyPrefix::None, 0x0F11), - SseOpcode::Movupd => (LegacyPrefix::_66, 0x0F11), + SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29), + 
SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29), + SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F7F), + SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F), + SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11), + SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11), + SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11), + SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11), _ => unimplemented!("Opcode {:?} not implemented", op), }; let dst = &dst.finalize(state); @@ -1860,9 +1874,9 @@ pub(crate) fn emit( let (prefix, opcode, dst_first) = match op { // Movd and movq use the same opcode; the presence of the REX prefix (set below) // actually determines which is used. - SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefix::_66, 0x0F7E, false), - SseOpcode::Cvttss2si => (LegacyPrefix::_F3, 0x0F2C, true), - SseOpcode::Cvttsd2si => (LegacyPrefix::_F2, 0x0F2C, true), + SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false), + SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true), + SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true), _ => panic!("unexpected opcode {:?}", op), }; let rex = match dst_size { @@ -1888,9 +1902,9 @@ pub(crate) fn emit( let (prefix, opcode) = match op { // Movd and movq use the same opcode; the presence of the REX prefix (set below) // actually determines which is used. 
- SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefix::_66, 0x0F6E), - SseOpcode::Cvtsi2ss => (LegacyPrefix::_F3, 0x0F2A), - SseOpcode::Cvtsi2sd => (LegacyPrefix::_F2, 0x0F2A), + SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E), + SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A), + SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A), _ => panic!("unexpected opcode {:?}", op), }; let rex = match *src_size { @@ -1911,8 +1925,8 @@ pub(crate) fn emit( Inst::XMM_Cmp_RM_R { op, src, dst } => { let rex = RexFlags::clear_w(); let (prefix, opcode) = match op { - SseOpcode::Ucomisd => (LegacyPrefix::_66, 0x0F2E), - SseOpcode::Ucomiss => (LegacyPrefix::None, 0x0F2E), + SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E), + SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E), _ => unimplemented!("Emit xmm cmp rm r"), }; @@ -2431,6 +2445,113 @@ pub(crate) fn emit( } } + Inst::LockCmpxchg { + ty, + src, + dst, + srcloc, + } => { + if let Some(srcloc) = srcloc { + sink.add_trap(*srcloc, TrapCode::HeapOutOfBounds); + } + // lock cmpxchg{b,w,l,q} %src, (dst) + // Note that 0xF0 is the Lock prefix. 
+ let (prefix, rex, opcodes) = match *ty { + types::I8 => { + let mut rex_flags = RexFlags::clear_w(); + let enc_src = int_reg_enc(*src); + if enc_src >= 4 && enc_src <= 7 { + rex_flags.always_emit(); + }; + (LegacyPrefixes::_F0, rex_flags, 0x0FB0) + } + types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1), + types::I32 => (LegacyPrefixes::_F0, RexFlags::clear_w(), 0x0FB1), + types::I64 => (LegacyPrefixes::_F0, RexFlags::set_w(), 0x0FB1), + _ => unreachable!(), + }; + emit_std_reg_mem(sink, prefix, opcodes, 2, *src, &dst.finalize(state), rex); + } + + Inst::AtomicRmwSeq { ty, op, srcloc } => { + // Emit this: + // + // mov{zbq,zwq,zlq,q} (%r9), %rax // rax = old value + // again: + // movq %rax, %r11 // rax = old value, r11 = old value + // `op`q %r10, %r11 // rax = old value, r11 = new value + // lock cmpxchg{b,w,l,q} %r11, (%r9) // try to store new value + // jnz again // If this is taken, rax will have a "revised" old value + // + // Operand conventions: + // IN: %r9 (addr), %r10 (2nd arg for `op`) + // OUT: %rax (old value), %r11 (trashed), %rflags (trashed) + // + // In the case where the operation is 'xchg', the "`op`q" instruction is instead + // movq %r10, %r11 + // so that we simply write in the destination, the "2nd arg for `op`". + let rax = regs::rax(); + let r9 = regs::r9(); + let r10 = regs::r10(); + let r11 = regs::r11(); + let rax_w = Writable::from_reg(rax); + let r11_w = Writable::from_reg(r11); + let amode = Amode::imm_reg(0, r9); + let again_label = sink.get_label(); + + // mov{zbq,zwq,zlq,q} (%r9), %rax + // No need to call `add_trap` here, since the `i1` emit will do that. 
+ let i1 = Inst::load(*ty, amode.clone(), rax_w, ExtKind::ZeroExtend, *srcloc); + i1.emit(sink, flags, state); + + // again: + sink.bind_label(again_label); + + // movq %rax, %r11 + let i2 = Inst::mov_r_r(true, rax, r11_w); + i2.emit(sink, flags, state); + + // opq %r10, %r11 + let r10_rmi = RegMemImm::reg(r10); + let i3 = if *op == inst_common::AtomicRmwOp::Xchg { + Inst::mov_r_r(true, r10, r11_w) + } else { + let alu_op = match op { + inst_common::AtomicRmwOp::Add => AluRmiROpcode::Add, + inst_common::AtomicRmwOp::Sub => AluRmiROpcode::Sub, + inst_common::AtomicRmwOp::And => AluRmiROpcode::And, + inst_common::AtomicRmwOp::Or => AluRmiROpcode::Or, + inst_common::AtomicRmwOp::Xor => AluRmiROpcode::Xor, + inst_common::AtomicRmwOp::Xchg => unreachable!(), + }; + Inst::alu_rmi_r(true, alu_op, r10_rmi, r11_w) + }; + i3.emit(sink, flags, state); + + // lock cmpxchg{b,w,l,q} %r11, (%r9) + // No need to call `add_trap` here, since the `i4` emit will do that. + let i4 = Inst::LockCmpxchg { + ty: *ty, + src: r11, + dst: amode.into(), + srcloc: *srcloc, + }; + i4.emit(sink, flags, state); + + // jnz again + one_way_jmp(sink, CC::NZ, again_label); + } + + Inst::Fence { kind } => { + sink.put1(0x0F); + sink.put1(0xAE); + match kind { + FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0 + FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8 + FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8 + } + } + Inst::Hlt => { sink.put1(0xcc); } diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index e0f2ea1acd..cb1a6b855a 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -4,10 +4,13 @@ //! //! to see stdout: cargo test -- --nocapture //! -//! for this specific case: +//! for this specific case, as of 24 Aug 2020: //! -//! (cd cranelift/codegen && \ -//! 
RUST_BACKTRACE=1 cargo test isa::x64::inst::test_x64_insn_encoding_and_printing -- --nocapture) +//! cd to the top of your wasmtime tree, then: +//! RUST_BACKTRACE=1 cargo test --features test-programs/test_programs \ +//! --features experimental_x64 --all --exclude peepmatic --exclude lightbeam \ +//! --exclude wasmtime-lightbeam --exclude peepmatic-automata --exclude peepmatic-fuzzing \ +//! --exclude peepmatic-macro -- isa::x64::inst::emit_tests::test_x64_emit use super::*; use crate::isa::test_utils; @@ -3272,6 +3275,174 @@ fn test_x64_emit() { "cmpps $0, %xmm15, %xmm7", )); + // ======================================================== + // Pertaining to atomics. + let am1: SyntheticAmode = Amode::imm_reg_reg_shift(321, r10, rdx, 2).into(); + // `am2` doesn't contribute any 1 bits to the rex prefix, so we must use it when testing + // for retention of the apparently-redundant rex prefix in the 8-bit case. + let am2: SyntheticAmode = Amode::imm_reg_reg_shift(-12345i32 as u32, rcx, rsi, 3).into(); + + // A general 8-bit case. + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rbx, + dst: am1, + srcloc: None, + }, + "F0410FB09C9241010000", + "lock cmpxchgb %bl, 321(%r10,%rdx,4)", + )); + // Check redundant rex retention in 8-bit cases. 
+ insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rdx, + dst: am2.clone(), + srcloc: None, + }, + "F00FB094F1C7CFFFFF", + "lock cmpxchgb %dl, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "F0400FB0B4F1C7CFFFFF", + "lock cmpxchgb %sil, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "F0440FB094F1C7CFFFFF", + "lock cmpxchgb %r10b, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I8, + src: r15, + dst: am2.clone(), + srcloc: None, + }, + "F0440FB0BCF1C7CFFFFF", + "lock cmpxchgb %r15b, -12345(%rcx,%rsi,8)", + )); + // 16 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "66F00FB1B4F1C7CFFFFF", + "lock cmpxchgw %si, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I16, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "66F0440FB194F1C7CFFFFF", + "lock cmpxchgw %r10w, -12345(%rcx,%rsi,8)", + )); + // 32 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "F00FB1B4F1C7CFFFFF", + "lock cmpxchgl %esi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I32, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "F0440FB194F1C7CFFFFF", + "lock cmpxchgl %r10d, -12345(%rcx,%rsi,8)", + )); + // 64 bit cases + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: rsi, + dst: am2.clone(), + srcloc: None, + }, + "F0480FB1B4F1C7CFFFFF", + "lock cmpxchgq %rsi, -12345(%rcx,%rsi,8)", + )); + insns.push(( + Inst::LockCmpxchg { + ty: types::I64, + src: r10, + dst: am2.clone(), + srcloc: None, + }, + "F04C0FB194F1C7CFFFFF", + "lock cmpxchgq %r10, -12345(%rcx,%rsi,8)", + )); + + // AtomicRmwSeq + insns.push(( + Inst::AtomicRmwSeq { ty: types::I8, op: 
inst_common::AtomicRmwOp::Or, srcloc: None }, + "490FB6014989C34D09D3F0450FB0190F85EFFFFFFF", + "atomically { 8_bits_at_[%r9]) Or= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I16, op: inst_common::AtomicRmwOp::And, srcloc: None }, + "490FB7014989C34D21D366F0450FB1190F85EEFFFFFF", + "atomically { 16_bits_at_[%r9]) And= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I32, op: inst_common::AtomicRmwOp::Xchg, srcloc: None }, + "418B014989C34D89D3F0450FB1190F85EFFFFFFF", + "atomically { 32_bits_at_[%r9]) Xchg= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + insns.push(( + Inst::AtomicRmwSeq { ty: types::I64, op: inst_common::AtomicRmwOp::Add, srcloc: None }, + "498B014989C34D01D3F04D0FB1190F85EFFFFFFF", + "atomically { 64_bits_at_[%r9]) Add= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" + )); + + // Fence + insns.push(( + Inst::Fence { + kind: FenceKind::MFence, + }, + "0FAEF0", + "mfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::LFence, + }, + "0FAEE8", + "lfence", + )); + insns.push(( + Inst::Fence { + kind: FenceKind::SFence, + }, + "0FAEF8", + "sfence", + )); + // ======================================================== // Misc instructions. diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 712a9b508e..da2dca2060 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -404,6 +404,56 @@ pub enum Inst { offset: i64, }, + // ===================================== + // Instructions pertaining to atomic memory accesses. + /// A standard (native) `lock cmpxchg src, (amode)`, with register conventions: + /// + /// `dst` (read) address + /// `src` (read) replacement value + /// %rax (modified) in: expected value, out: value that was actually at `dst` + /// %rflags is written. 
Do not assume anything about it after the instruction.
+ ///
+ /// The instruction "succeeded" iff the lowest `ty` bits of %rax afterwards are the same as
+ /// they were before.
+ LockCmpxchg {
+ ty: Type, // I8, I16, I32 or I64
+ src: Reg,
+ dst: SyntheticAmode,
+ srcloc: Option<SourceLoc>,
+ },
+
+ /// A synthetic instruction, based on a loop around a native `lock cmpxchg` instruction.
+ /// This atomically modifies a value in memory and returns the old value. The sequence
+ /// consists of an initial "normal" load from `dst`, followed by a loop which computes the
+ /// new value and tries to compare-and-swap ("CAS") it into `dst`, using the native
+ /// instruction `lock cmpxchg{b,w,l,q}` . The loop iterates until the CAS is successful.
+ /// If there is no contention, there will be only one pass through the loop body. The
+ /// sequence does *not* perform any explicit memory fence instructions
+ /// (mfence/sfence/lfence).
+ ///
+ /// Note that the transaction is atomic in the sense that, as observed by some other thread,
+ /// `dst` either has the initial or final value, but no other. It isn't atomic in the sense
+ /// of guaranteeing that no other thread writes to `dst` in between the initial load and the
+ /// CAS -- but that would cause the CAS to fail unless the other thread's last write before
+ /// the CAS wrote the same value that was already there. In other words, this
+ /// implementation suffers (unavoidably) from the A-B-A problem.
+ ///
+ /// This instruction sequence has fixed register uses as follows:
+ ///
+ /// %r9 (read) address
+ /// %r10 (read) second operand for `op`
+ /// %r11 (written) scratch reg; value afterwards has no meaning
+ /// %rax (written) the old value at %r9
+ /// %rflags is written. Do not assume anything about it after the instruction.
+ AtomicRmwSeq {
+ ty: Type, // I8, I16, I32 or I64
+ op: inst_common::AtomicRmwOp,
+ srcloc: Option<SourceLoc>,
+ },
+
+ /// A memory fence (mfence, lfence or sfence).
+ Fence { kind: FenceKind }, + // ===================================== // Meta-instructions generating no code. /// Marker, no-op in generated code: SP "virtual offset" is adjusted. This @@ -1521,6 +1571,26 @@ impl ShowWithRRU for Inst { show_ireg_sized(dst.to_reg(), mb_rru, 8), ), + Inst::LockCmpxchg { ty, src, dst, .. } => { + let size = ty.bytes() as u8; + format!("lock cmpxchg{} {}, {}", + suffixBWLQ(size), show_ireg_sized(*src, mb_rru, size), dst.show_rru(mb_rru)) + } + + Inst::AtomicRmwSeq { ty, op, .. } => { + format!( + "atomically {{ {}_bits_at_[%r9]) {:?}= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }}", + ty.bits(), op) + }, + + Inst::Fence { kind } => { + match kind { + FenceKind::MFence => "mfence".to_string(), + FenceKind::LFence => "lfence".to_string(), + FenceKind::SFence => "sfence".to_string(), + } + } + Inst::VirtualSPOffsetAdj { offset } => format!("virtual_sp_offset_adjust {}", offset), Inst::Hlt => "hlt".into(), @@ -1737,6 +1807,19 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(*dst); } + Inst::LockCmpxchg { src, dst, .. } => { + dst.get_regs_as_uses(collector); + collector.add_use(*src); + collector.add_mod(Writable::from_reg(regs::rax())); + } + + Inst::AtomicRmwSeq { .. } => { + collector.add_use(regs::r9()); + collector.add_use(regs::r10()); + collector.add_def(Writable::from_reg(regs::r11())); + collector.add_def(Writable::from_reg(regs::rax())); + } + Inst::Ret | Inst::EpiloguePlaceholder | Inst::JmpKnown { .. } @@ -1745,7 +1828,8 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { | Inst::TrapIf { .. } | Inst::VirtualSPOffsetAdj { .. } | Inst::Hlt - | Inst::Ud2 { .. } => { + | Inst::Ud2 { .. } + | Inst::Fence { .. } => { // No registers are used. } } @@ -2091,6 +2175,15 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { Inst::LoadExtName { ref mut dst, .. } => map_def(mapper, dst), + Inst::LockCmpxchg { + ref mut src, + ref mut dst, + .. 
+ } => {
+ map_use(mapper, src);
+ dst.map_uses(mapper);
+ }
+
 Inst::Ret
 | Inst::EpiloguePlaceholder
 | Inst::JmpKnown { .. }
@@ -2099,8 +2192,11 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
 | Inst::TrapIf { .. }
 | Inst::VirtualSPOffsetAdj { .. }
 | Inst::Ud2 { .. }
- | Inst::Hlt => {
- // No registers are used.
+ | Inst::Hlt
+ | Inst::AtomicRmwSeq { .. }
+ | Inst::Fence { .. } => {
+ // Instruction doesn't explicitly mention any regs, so it can't have any virtual
+ // regs that we'd need to remap. Hence no action required.
 }
 }
}
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index f4eb306882..1b494db706 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -2,6 +2,7 @@
 #![allow(non_snake_case)]
+use crate::ir;
 use crate::ir::{
 condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
 Inst as IRInst, InstructionData, LibCall, Opcode, Signature, TrapCode, Type,
@@ -45,6 +46,14 @@ fn is_bool_ty(ty: Type) -> bool {
 }
 }
+/// This is target-word-size dependent. And it excludes booleans and reftypes.
+fn is_valid_atomic_transaction_ty(ty: Type) -> bool {
+ match ty {
+ types::I8 | types::I16 | types::I32 | types::I64 => true,
+ _ => false,
+ }
+}
+
 fn iri_to_u64_imm(ctx: Ctx, inst: IRInst) -> Option<u64> {
 ctx.get_constant(inst)
}
@@ -82,6 +91,13 @@ fn inst_fp_condcode(data: &InstructionData) -> Option<FloatCC> {
 }
}
+fn inst_atomic_rmw_op(data: &InstructionData) -> Option<ir::AtomicRmwOp> {
+ match data {
+ &InstructionData::AtomicRmw { op, .. } => Some(op),
+ _ => None,
+ }
+}
+
 fn ldst_offset(data: &InstructionData) -> Option<i32> {
 match data {
 &InstructionData::Load { offset, .. }
@@ -1732,6 +1748,148 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 });
 }
+ Opcode::AtomicRmw => {
+ // This is a simple, general-case atomic update, based on a loop involving
+ // `cmpxchg`. 
Note that we could do much better than this in the case where the old + // value at the location (that is to say, the SSA `Value` computed by this CLIF + // instruction) is not required. In that case, we could instead implement this + // using a single `lock`-prefixed x64 read-modify-write instruction. Also, even in + // the case where the old value is required, for the `add` and `sub` cases, we can + // use the single instruction `lock xadd`. However, those improvements have been + // left for another day. + // TODO: filed as https://github.com/bytecodealliance/wasmtime/issues/2153 + let dst = output_to_reg(ctx, outputs[0]); + let mut addr = input_to_reg(ctx, inputs[0]); + let mut arg2 = input_to_reg(ctx, inputs[1]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // Make sure that both args are in virtual regs, since in effect we have to do a + // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not + // guaranteed safe if either is in a real reg. + addr = ctx.ensure_in_vreg(addr, types::I64); + arg2 = ctx.ensure_in_vreg(arg2, types::I64); + // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq` + // operates at whatever width is specified by `ty`, so there's no need to + // zero-extend `arg2` in the case of `ty` being I8/I16/I32. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r9()), + addr, + types::I64, + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::r10()), + arg2, + types::I64, + )); + // Now the AtomicRmwSeq (pseudo-) instruction itself + let op = inst_common::AtomicRmwOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap()); + ctx.emit(Inst::AtomicRmwSeq { + ty: ty_access, + op, + srcloc, + }); + // And finally, copy the preordained AtomicRmwSeq output reg to its destination. 
+ ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicCas => { + // This is very similar to, but not identical to, the `AtomicRmw` case. As with + // `AtomicRmw`, there's no need to zero-extend narrow values here. + let dst = output_to_reg(ctx, outputs[0]); + let addr = input_to_reg(ctx, inputs[0]); + let expected = input_to_reg(ctx, inputs[1]); + let replacement = input_to_reg(ctx, inputs[2]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // Move the expected value into %rax. Because there's only one fixed register on + // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the + // `AtomicRmw` case. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + expected, + types::I64, + )); + ctx.emit(Inst::LockCmpxchg { + ty: ty_access, + src: replacement, + dst: Amode::imm_reg(0, addr).into(), + srcloc, + }); + // And finally, copy the old value at the location to its destination reg. + ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64)); + } + + Opcode::AtomicLoad => { + // This is a normal load. The x86-TSO memory model provides sufficient sequencing + // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the + // need for any fence instructions. + let data = output_to_reg(ctx, outputs[0]); + let addr = input_to_reg(ctx, inputs[0]); + let ty_access = ty.unwrap(); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // For the amode, we could do better, but for now just use `0(addr)`. 
+ let rm = RegMem::mem(Amode::imm_reg(0, addr)); + if ty_access == types::I64 { + ctx.emit(Inst::mov64_rm_r(rm, data, srcloc)); + } else { + let ext_mode = match ty_access { + types::I8 => ExtMode::BQ, + types::I16 => ExtMode::WQ, + types::I32 => ExtMode::LQ, + _ => panic!("lowering AtomicLoad: invalid type"), + }; + ctx.emit(Inst::movzx_rm_r(ext_mode, rm, data, srcloc)); + } + } + + Opcode::AtomicStore => { + // This is a normal store, followed by an `mfence` instruction. + let data = input_to_reg(ctx, inputs[0]); + let addr = input_to_reg(ctx, inputs[1]); + let ty_access = ctx.input_ty(insn, 0); + assert!(is_valid_atomic_transaction_ty(ty_access)); + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + // For the amode, we could do better, but for now just use `0(addr)`. + ctx.emit(Inst::mov_r_m( + ty_access.bytes() as u8, + data, + Amode::imm_reg(0, addr), + srcloc, + )); + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + + Opcode::Fence => { + ctx.emit(Inst::Fence { + kind: FenceKind::MFence, + }); + } + Opcode::FuncAddr => { let dst = output_to_reg(ctx, outputs[0]); let (extname, _) = ctx.call_target(insn).unwrap(); diff --git a/cranelift/codegen/src/machinst/inst_common.rs b/cranelift/codegen/src/machinst/inst_common.rs new file mode 100644 index 0000000000..9566c56e53 --- /dev/null +++ b/cranelift/codegen/src/machinst/inst_common.rs @@ -0,0 +1,36 @@ +//! A place to park MachInst::Inst fragments which are common across multiple architectures. + +use crate::ir; + +/// Atomic memory update operations. As of 21 Aug 2020 these are used for the aarch64 and x64 +/// targets. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum AtomicRmwOp { + /// Add + Add, + /// Sub + Sub, + /// And + And, + /// Or + Or, + /// Exclusive Or + Xor, + /// Exchange (swap operands) + Xchg, +} + +impl AtomicRmwOp { + /// Converts an `ir::AtomicRmwOp` to the corresponding `inst_common::AtomicRmwOp`. + pub fn from(ir_op: ir::AtomicRmwOp) -> Self { + match ir_op { + ir::AtomicRmwOp::Add => AtomicRmwOp::Add, + ir::AtomicRmwOp::Sub => AtomicRmwOp::Sub, + ir::AtomicRmwOp::And => AtomicRmwOp::And, + ir::AtomicRmwOp::Or => AtomicRmwOp::Or, + ir::AtomicRmwOp::Xor => AtomicRmwOp::Xor, + ir::AtomicRmwOp::Xchg => AtomicRmwOp::Xchg, + } + } +} diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index b8ec275133..915764436e 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -133,6 +133,8 @@ pub mod adapter; pub use adapter::*; pub mod helpers; pub use helpers::*; +pub mod inst_common; +pub use inst_common::*; /// A machine instruction. pub trait MachInst: Clone + Debug {