From 7c856542854bc8c5da9d5fb1a0b41f3c660d8484 Mon Sep 17 00:00:00 2001
From: Benjamin Bouvier
Date: Fri, 21 Aug 2020 12:40:47 +0200
Subject: [PATCH] Address review comments.

---
 cranelift/codegen/src/inst_predicates.rs  |   2 +-
 cranelift/codegen/src/isa/x64/inst/mod.rs |   7 +
 cranelift/codegen/src/isa/x64/lower.rs    | 542 ++++++++++++----------
 cranelift/codegen/src/machinst/lower.rs   |  12 +-
 4 files changed, 300 insertions(+), 263 deletions(-)

diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs
index 6830fcda1c..1aac4be2fd 100644
--- a/cranelift/codegen/src/inst_predicates.rs
+++ b/cranelift/codegen/src/inst_predicates.rs
@@ -43,7 +43,7 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
 
 /// Does the given instruction have any side-effect as per [has_side_effect], or else is a load,
 /// but not the get_pinned_reg opcode?
-pub fn has_side_effect_or_load_not_get_pinned_reg(func: &Function, inst: Inst) -> bool {
+pub fn has_lowering_side_effect(func: &Function, inst: Inst) -> bool {
     let op = func.dfg[inst].opcode();
     op != Opcode::GetPinnedReg && (has_side_effect(func, inst) || op.can_load())
 }
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index ec52f79d5c..a39b0e6857 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -360,6 +360,13 @@ pub enum Inst {
     JmpKnown { dst: BranchTarget },
 
     /// One-way conditional branch: jcond cond target.
+    ///
+    /// This instruction is useful when we have conditional jumps depending on more than two
+    /// conditions; see, for instance, the lowering of Brz/Brnz with Fcmp inputs.
+    ///
+    /// A note of caution: in contexts where the branch target is another block, this has to be the
+    /// same successor as the one specified in the terminator branch of the current block.
+    /// Otherwise, this might confuse register allocation by creating new invisible edges.
     JmpIf { cc: CC, taken: BranchTarget },
 
     /// Two-way conditional branch: jcond cond target target.
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 047eac99d8..c57cf40057 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -124,29 +124,37 @@ struct InsnOutput {
     output: usize,
 }
 
+/// Returns whether the given `input` is a result produced by an instruction whose opcode is
+/// `op`.
+// TODO investigate failures with checking against the result index.
 fn matches_input<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     input: InsnInput,
     op: Opcode,
 ) -> Option<IRInst> {
     let inputs = ctx.get_input(input.insn, input.input);
-    if let Some((src_inst, _)) = inputs.inst {
+    inputs.inst.and_then(|(src_inst, _)| {
         let data = ctx.data(src_inst);
         if data.opcode() == op {
             return Some(src_inst);
         }
-    }
-    None
+        None
+    })
+}
+
+fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg {
+    ctx.use_input_reg(input);
+    input.reg
 }
 
 /// Put the given input into a register, and mark it as used (side-effect).
 fn input_to_reg(ctx: Ctx, spec: InsnInput) -> Reg {
-    let inputs = ctx.get_input(spec.insn, spec.input);
-    ctx.use_input_reg(inputs);
-    inputs.reg
+    let input = ctx.get_input(spec.insn, spec.input);
+    lowerinput_to_reg(ctx, input)
 }
 
 /// An extension specification for `extend_input_to_reg`.
+#[derive(Clone, Copy)]
 enum ExtSpec {
     ZeroExtendTo32,
     ZeroExtendTo64,
@@ -163,6 +171,12 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
     };
     let input_size = ctx.input_ty(spec.insn, spec.input).bits();
 
+    let requested_ty = if requested_size == 32 {
+        types::I32
+    } else {
+        types::I64
+    };
+
     let ext_mode = match (input_size, requested_size) {
         (a, b) if a == b => return input_to_reg(ctx, spec),
         (a, 32) if a == 1 || a == 8 => ExtMode::BL,
@@ -173,12 +187,6 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
         _ => unreachable!(),
    };
 
-    let requested_ty = if requested_size == 32 {
-        types::I32
-    } else {
-        types::I64
-    };
-
     let src = input_to_reg_mem(ctx, spec);
     let dst = ctx.alloc_tmp(RegClass::I64, requested_ty);
     match ext_spec {
@@ -196,21 +204,26 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
     dst.to_reg()
 }
 
+fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem {
+    // TODO handle memory.
+    RegMem::reg(lowerinput_to_reg(ctx, input))
+}
+
 /// Put the given input into a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
 fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
-    // TODO handle memory.
-    RegMem::reg(input_to_reg(ctx, spec))
+    let input = ctx.get_input(spec.insn, spec.input);
+    lowerinput_to_reg_mem(ctx, input)
 }
 
 /// Returns whether the given input is an immediate that can be properly sign-extended, without any
 /// possible side-effect.
-fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
-    ctx.get_input(spec.insn, spec.input).constant.and_then(|x| {
+fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> {
+    input.constant.and_then(|x| {
         // For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
         // to 64 bits. For other sizes, it doesn't matter and we can just use the plain
         // constant.
-        if ctx.input_ty(spec.insn, spec.input).bytes() != 8 || low32_will_sign_extend_to_64(x) {
+        if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
             Some(x as u32)
         } else {
             None
@@ -218,6 +231,12 @@ fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
     })
 }
 
+fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
+    let input = ctx.get_input(spec.insn, spec.input);
+    let input_ty = ctx.input_ty(spec.insn, spec.input);
+    lowerinput_to_sext_imm(input, input_ty)
+}
+
 fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
     ctx.get_input(spec.insn, spec.input).constant
 }
@@ -225,9 +244,11 @@ fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
 /// Put the given input into an immediate, a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
 fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
-    match input_to_sext_imm(ctx, spec) {
+    let input = ctx.get_input(spec.insn, spec.input);
+    let input_ty = ctx.input_ty(spec.insn, spec.input);
+    match lowerinput_to_sext_imm(input, input_ty) {
         Some(x) => RegMemImm::imm(x),
-        None => match input_to_reg_mem(ctx, spec) {
+        None => match lowerinput_to_reg_mem(ctx, input) {
             RegMem::Reg { reg } => RegMemImm::reg(reg),
             RegMem::Mem { addr } => RegMemImm::mem(addr),
         },
@@ -252,34 +273,88 @@ fn emit_cmp(ctx: Ctx, insn: IRInst) {
     ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs));
 }
 
-#[derive(PartialEq)]
-enum FcmpOperands {
-    Swap,
-    DontSwap,
+/// A specification for a fcmp emission.
+enum FcmpSpec {
+    /// Normal flow.
+    Normal,
+
+    /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
+    /// happens with `InvertedEqualOrConditions`.
+    ///
+    /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or
+    /// sequence of instructions) that checks for an "AND" combination of condition codes; see, for
+    /// instance, the lowering of Select.
+    InvertEqual,
 }
 
-fn emit_fcmp(ctx: Ctx, insn: IRInst, swap_operands: FcmpOperands) {
+/// This explains how to interpret the results of an fcmp instruction.
+enum FcmpCondResult {
+    /// The given condition code must be set.
+    Condition(CC),
+
+    /// Both condition codes must be set.
+    AndConditions(CC, CC),
+
+    /// Either of the condition codes must be set.
+    OrConditions(CC, CC),
+
+    /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
+    /// of the condition codes must be set, and the user must invert the meaning when interpreting
+    /// the condition code results. When the spec is set to `FcmpSpec::Normal`, this case can't be
+    /// reached.
+    InvertedEqualOrConditions(CC, CC),
+}
+
+fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
+    let (flip_operands, inverted_equal) = match cond_code {
+        FloatCC::LessThan
+        | FloatCC::LessThanOrEqual
+        | FloatCC::UnorderedOrGreaterThan
+        | FloatCC::UnorderedOrGreaterThanOrEqual => {
+            cond_code = cond_code.reverse();
+            (true, false)
+        }
+        FloatCC::Equal => {
+            let inverted_equal = match spec {
+                FcmpSpec::Normal => false,
+                FcmpSpec::InvertEqual => {
+                    cond_code = FloatCC::NotEqual; // same as .inverse()
+                    true
+                }
+            };
+            (false, inverted_equal)
+        }
+        _ => (false, false),
+    };
+
     // The only valid CC constructed with `from_floatcc` can be put in the flag
     // register with a direct float comparison; do this here.
-    let input_ty = ctx.input_ty(insn, 0);
-    let op = match input_ty {
+    let op = match ctx.input_ty(insn, 0) {
         types::F32 => SseOpcode::Ucomiss,
         types::F64 => SseOpcode::Ucomisd,
         _ => panic!("Bad input type to Fcmp"),
     };
+
     let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
-    let (lhs, rhs) = if swap_operands == FcmpOperands::Swap {
-        (
-            input_to_reg(ctx, inputs[1]),
-            input_to_reg_mem(ctx, inputs[0]),
-        )
+    let (lhs_input, rhs_input) = if flip_operands {
+        (inputs[1], inputs[0])
     } else {
-        (
-            input_to_reg(ctx, inputs[0]),
-            input_to_reg_mem(ctx, inputs[1]),
-        )
+        (inputs[0], inputs[1])
     };
+
+    let lhs = input_to_reg(ctx, lhs_input);
+    let rhs = input_to_reg_mem(ctx, rhs_input);
     ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
+
+    match cond_code {
+        FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
+        FloatCC::NotEqual if inverted_equal => {
+            FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
+        }
+        FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
+        _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
+    }
 }
 
 fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
@@ -350,33 +425,31 @@ fn emit_vm_call<C: LowerCtx<I = Inst>>(
 
 /// Returns whether the given input is a shift by a constant value less than or equal to 3.
 /// The goal is to embed it within an address mode.
-fn matches_small_cst_shift<C: LowerCtx<I = Inst>>(
+fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     spec: InsnInput,
 ) -> Option<(InsnInput, u8)> {
-    if let Some(shift) = matches_input(ctx, spec, Opcode::Ishl) {
-        if let Some(shift_amt) = input_to_imm(
+    matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
+        match input_to_imm(
             ctx,
             InsnInput {
                 insn: shift,
                 input: 1,
             },
         ) {
-            if shift_amt <= 3 {
-                return Some((
-                    InsnInput {
-                        insn: shift,
-                        input: 0,
-                    },
-                    shift_amt as u8,
-                ));
-            }
+            Some(shift_amt) if shift_amt <= 3 => Some((
+                InsnInput {
+                    insn: shift,
+                    input: 0,
+                },
+                shift_amt as u8,
+            )),
+            _ => None,
         }
-    }
-    None
+    })
 }
 
-fn lower_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: u32) -> Amode {
+fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: u32) -> Amode {
     // We now either have an add that we must materialize, or some other input; as well as the
     // final offset.
     if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
@@ -394,14 +467,16 @@ fn lower_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: u32)
         // TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
         // aren't happening in the wasm case. We could do better, given some range analysis.
         let (base, index, shift) = if let Some((shift_input, shift_amt)) =
-            matches_small_cst_shift(ctx, add_inputs[0])
+            matches_small_constant_shift(ctx, add_inputs[0])
         {
             (
                 input_to_reg(ctx, add_inputs[1]),
                 input_to_reg(ctx, shift_input),
                 shift_amt,
             )
-        } else if let Some((shift_input, shift_amt)) = matches_small_cst_shift(ctx, add_inputs[1]) {
+        } else if let Some((shift_input, shift_amt)) =
+            matches_small_constant_shift(ctx, add_inputs[1])
+        {
             (
                 input_to_reg(ctx, add_inputs[0]),
                 input_to_reg(ctx, shift_input),
@@ -1027,15 +1102,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         }
 
         Opcode::Fcmp => {
-            let condcode = inst_fp_condcode(ctx.data(insn));
+            let cond_code = inst_fp_condcode(ctx.data(insn));
             let input_ty = ctx.input_ty(insn, 0);
             if !input_ty.is_vector() {
-                let op = match input_ty {
-                    types::F32 => SseOpcode::Ucomiss,
-                    types::F64 => SseOpcode::Ucomisd,
-                    _ => panic!("Bad input type to fcmp: {}", input_ty),
-                };
-
                 // Unordered is returned by setting ZF, PF, CF <- 111
                 // Greater than by ZF, PF, CF <- 000
                 // Less than by ZF, PF, CF <- 001
@@ -1051,71 +1120,35 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // set, then both the ZF and CF flag bits must also be set we can get away with using
                 // one setcc for most condition codes.
 
-                match condcode {
-                    FloatCC::LessThan
-                    | FloatCC::LessThanOrEqual
-                    | FloatCC::UnorderedOrGreaterThan
-                    | FloatCC::UnorderedOrGreaterThanOrEqual => {
-                        // setb and setbe for ordered LessThan and LessThanOrEqual check if CF = 1
-                        // which doesn't exclude unorderdness. To get around this we can reverse the
-                        // operands and the cc test to instead check if CF and ZF are 0 which would
-                        // also excludes unorderedness. Using similiar logic we also reverse
-                        // UnorderedOrGreaterThan and UnorderedOrGreaterThanOrEqual and assure that ZF
-                        // or CF is 1 to exclude orderedness.
-                        let lhs = input_to_reg_mem(ctx, inputs[0]);
-                        let rhs = input_to_reg(ctx, inputs[1]);
-                        let dst = output_to_reg(ctx, outputs[0]);
-                        ctx.emit(Inst::xmm_cmp_rm_r(op, lhs, rhs));
-                        let condcode = condcode.reverse();
-                        let cc = CC::from_floatcc(condcode);
+                let dst = output_to_reg(ctx, outputs[0]);
+
+                match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
+                    FcmpCondResult::Condition(cc) => {
                         ctx.emit(Inst::setcc(cc, dst));
                     }
-
-                    FloatCC::Equal => {
-                        // Outlier case: equal means both the operands are ordered and equal; we cannot
-                        // get around checking the parity bit to determine if the result was ordered.
-                        let lhs = input_to_reg(ctx, inputs[0]);
-                        let rhs = input_to_reg_mem(ctx, inputs[1]);
-                        let dst = output_to_reg(ctx, outputs[0]);
-                        let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I32);
-                        ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
-                        ctx.emit(Inst::setcc(CC::NP, tmp_gpr1));
-                        ctx.emit(Inst::setcc(CC::Z, dst));
+                    FcmpCondResult::AndConditions(cc1, cc2) => {
+                        let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+                        ctx.emit(Inst::setcc(cc1, tmp));
+                        ctx.emit(Inst::setcc(cc2, dst));
                         ctx.emit(Inst::alu_rmi_r(
                             false,
                             AluRmiROpcode::And,
-                            RegMemImm::reg(tmp_gpr1.to_reg()),
+                            RegMemImm::reg(tmp.to_reg()),
                             dst,
                         ));
                     }
-
-                    FloatCC::NotEqual => {
-                        // Outlier case: not equal means either the operands are unordered, or they're
-                        // not the same value.
-                        let lhs = input_to_reg(ctx, inputs[0]);
-                        let rhs = input_to_reg_mem(ctx, inputs[1]);
-                        let dst = output_to_reg(ctx, outputs[0]);
-                        let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I32);
-                        ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
-                        ctx.emit(Inst::setcc(CC::P, tmp_gpr1));
-                        ctx.emit(Inst::setcc(CC::NZ, dst));
+                    FcmpCondResult::OrConditions(cc1, cc2) => {
+                        let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+                        ctx.emit(Inst::setcc(cc1, tmp));
+                        ctx.emit(Inst::setcc(cc2, dst));
                         ctx.emit(Inst::alu_rmi_r(
                             false,
                             AluRmiROpcode::Or,
-                            RegMemImm::reg(tmp_gpr1.to_reg()),
+                            RegMemImm::reg(tmp.to_reg()),
                             dst,
                         ));
                     }
-
-                    _ => {
-                        // For all remaining condition codes we can handle things with one check.
-                        let lhs = input_to_reg(ctx, inputs[0]);
-                        let rhs = input_to_reg_mem(ctx, inputs[1]);
-                        let dst = output_to_reg(ctx, outputs[0]);
-                        let cc = CC::from_floatcc(condcode);
-                        ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
-                        ctx.emit(Inst::setcc(cc, dst));
-                    }
+                    FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                 }
             } else {
                 let op = match input_ty {
@@ -1126,7 +1159,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // Since some packed comparisons are not available, some of the condition codes
                 // must be inverted, with a corresponding `flip` of the operands.
-                let (imm, flip) = match condcode {
+                let (imm, flip) = match cond_code {
                     FloatCC::GreaterThan => (FcmpImm::LessThan, true),
                     FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
                     FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
@@ -1134,9 +1167,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         (FcmpImm::UnorderedOrGreaterThanOrEqual, true)
                     }
                     FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
-                        panic!("unsupported float condition code: {}", condcode)
+                        panic!("unsupported float condition code: {}", cond_code)
                     }
-                    _ => (FcmpImm::from(condcode), false),
+                    _ => (FcmpImm::from(cond_code), false),
                 };
 
                 // Determine the operands of the comparison, possibly by flipping them.
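
To make the ZF/PF/CF table above concrete, here is a small self-contained model (illustrative only; `CmpResult` and `ucomis_flags` are invented names, not Cranelift APIs or part of this patch) showing why `FloatCC::Equal` lowers to the conjunction `AndConditions(CC::NP, CC::Z)` while `FloatCC::NotEqual` lowers to the disjunction `OrConditions(CC::P, CC::NZ)`:

    #[derive(Clone, Copy, PartialEq, Debug)]
    enum CmpResult {
        Less,
        Equal,
        Greater,
        Unordered,
    }

    /// Flag triples (ZF, PF, CF) as set by ucomiss/ucomisd:
    /// Unordered = 111, Greater = 000, Less = 001, Equal = 100.
    fn ucomis_flags(r: CmpResult) -> (bool, bool, bool) {
        match r {
            CmpResult::Unordered => (true, true, true),
            CmpResult::Greater => (false, false, false),
            CmpResult::Less => (false, false, true),
            CmpResult::Equal => (true, false, false),
        }
    }

    fn main() {
        use CmpResult::*;
        for &r in &[Less, Equal, Greater, Unordered] {
            let (zf, pf, _cf) = ucomis_flags(r);
            // Equal needs NP *and* Z: ZF alone would also accept Unordered.
            assert_eq!(!pf && zf, r == Equal);
            // NotEqual is the complement: P *or* NZ.
            assert_eq!(pf || !zf, r != Equal);
        }
    }

These are exactly the `AndConditions`/`OrConditions` results that `emit_fcmp` returns; every other condition code, after the operand flip, fits into a single `Condition(cc)`.
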
@@ -1225,35 +1258,77 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let srcloc = ctx.srcloc(insn);
             let trap_code = inst_trapcode(ctx.data(insn)).unwrap();
 
-            let cc = if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
-                let condcode = inst_condcode(ctx.data(insn));
+            if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
+                let cond_code = inst_condcode(ctx.data(insn));
 
                 // The flags must not have been clobbered by any other instruction between the
                 // iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
                 // simply use the flags here.
-                CC::from_intcc(condcode)
+                let cc = CC::from_intcc(cond_code);
+
+                ctx.emit_safepoint(Inst::TrapIf {
+                    trap_code,
+                    srcloc,
+                    cc,
+                });
             } else if op == Opcode::Trapif {
-                let condcode = inst_condcode(ctx.data(insn));
-                let cc = CC::from_intcc(condcode);
+                let cond_code = inst_condcode(ctx.data(insn));
+                let cc = CC::from_intcc(cond_code);
 
                 // Verification ensures that the input is always a single-def ifcmp.
-                let ifcmp_insn = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
-                emit_cmp(ctx, ifcmp_insn);
-                cc
+                let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
+                emit_cmp(ctx, ifcmp);
+
+                ctx.emit_safepoint(Inst::TrapIf {
+                    trap_code,
+                    srcloc,
+                    cc,
+                });
             } else {
-                let condcode = inst_fp_condcode(ctx.data(insn));
-                let cc = CC::from_floatcc(condcode);
+                let cond_code = inst_fp_condcode(ctx.data(insn));
 
                 // Verification ensures that the input is always a single-def ffcmp.
-                let ffcmp_insn = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
-                emit_fcmp(ctx, ffcmp_insn, FcmpOperands::DontSwap);
-                cc
-            };
+                let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
 
-            ctx.emit_safepoint(Inst::TrapIf {
-                trap_code,
-                srcloc,
-                cc,
-            });
+                match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
+                    FcmpCondResult::Condition(cc) => ctx.emit_safepoint(Inst::TrapIf {
+                        trap_code,
+                        srcloc,
+                        cc,
+                    }),
+                    FcmpCondResult::AndConditions(cc1, cc2) => {
+                        // A bit unfortunate, but materialize the flags in their own register, and
+                        // check against this.
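+                        // Concretely: setcc each condition into its own temporary, AND the two
+                        // temporaries together, and trap on a non-zero result, so the trap fires
+                        // only when *both* condition codes are set. Two chained TrapIf
+                        // instructions would be wrong here: they would trap on *either*
+                        // condition, which is the encoding used for OrConditions below.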
+                        let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
+                        let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32);
+                        ctx.emit(Inst::setcc(cc1, tmp));
+                        ctx.emit(Inst::setcc(cc2, tmp2));
+                        ctx.emit(Inst::alu_rmi_r(
+                            false, /* is_64 */
+                            AluRmiROpcode::And,
+                            RegMemImm::reg(tmp.to_reg()),
+                            tmp2,
+                        ));
+                        ctx.emit_safepoint(Inst::TrapIf {
+                            trap_code,
+                            srcloc,
+                            cc: CC::NZ,
+                        });
+                    }
+                    FcmpCondResult::OrConditions(cc1, cc2) => {
+                        ctx.emit_safepoint(Inst::TrapIf {
+                            trap_code,
+                            srcloc,
+                            cc: cc1,
+                        });
+                        ctx.emit_safepoint(Inst::TrapIf {
+                            trap_code,
+                            srcloc,
+                            cc: cc2,
+                        });
+                    }
+                    FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
+                };
+            };
         }
 
         Opcode::F64const => {
@@ -1751,7 +1826,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 | Opcode::Uload32
                 | Opcode::Sload32 => {
                     assert_eq!(inputs.len(), 1, "only one input for load operands");
-                    lower_amode(ctx, inputs[0], offset as u32)
+                    lower_to_amode(ctx, inputs[0], offset as u32)
                 }
 
                 Opcode::LoadComplex
@@ -1842,7 +1917,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let addr = match op {
                 Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
                     assert_eq!(inputs.len(), 2, "only one input for store memory operands");
-                    lower_amode(ctx, inputs[1], offset as u32)
+                    lower_to_amode(ctx, inputs[1], offset as u32)
                 }
 
                 Opcode::StoreComplex
@@ -1899,11 +1974,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 None
             };
+
             // Make sure that both args are in virtual regs, since in effect we have to do a
             // parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
             // guaranteed safe if either is in a real reg.
             addr = ctx.ensure_in_vreg(addr, types::I64);
             arg2 = ctx.ensure_in_vreg(arg2, types::I64);
+
             // Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
             // operates at whatever width is specified by `ty`, so there's no need to
             // zero-extend `arg2` in the case of `ty` being I8/I16/I32.
@@ -1917,6 +1994,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 arg2,
                 types::I64,
             ));
+
             // Now the AtomicRmwSeq (pseudo-) instruction itself
             let op = inst_common::AtomicRmwOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap());
             ctx.emit(Inst::AtomicRmwSeq {
@@ -1924,6 +2002,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 op,
                 srcloc,
             });
+
             // And finally, copy the preordained AtomicRmwSeq output reg to its destination.
             ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
         }
@@ -1932,7 +2011,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // This is very similar to, but not identical to, the `AtomicRmw` case. As with
             // `AtomicRmw`, there's no need to zero-extend narrow values here.
             let dst = output_to_reg(ctx, outputs[0]);
-            let addr = input_to_reg(ctx, inputs[0]);
+            let addr = lower_to_amode(ctx, inputs[0], 0);
             let expected = input_to_reg(ctx, inputs[1]);
             let replacement = input_to_reg(ctx, inputs[2]);
             let ty_access = ty.unwrap();
@@ -1943,6 +2022,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 None
             };
+
             // Move the expected value into %rax. Because there's only one fixed register on
             // the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
             // `AtomicRmw` case.
@@ -1954,7 +2034,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             ctx.emit(Inst::LockCmpxchg {
                 ty: ty_access,
                 src: replacement,
-                dst: Amode::imm_reg(0, addr).into(),
+                dst: addr.into(),
                 srcloc,
             });
             // And finally, copy the old value at the location to its destination reg.
@@ -1966,7 +2046,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             // to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
             // need for any fence instructions.
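+            // (x86-TSO already gives aligned loads acquire semantics, so a plain mov suffices
+            // here; contrast with `AtomicStore` below, which needs a trailing mfence.)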
             let data = output_to_reg(ctx, outputs[0]);
-            let addr = input_to_reg(ctx, inputs[0]);
+            let addr = lower_to_amode(ctx, inputs[0], 0);
             let ty_access = ty.unwrap();
             assert!(is_valid_atomic_transaction_ty(ty_access));
             let memflags = ctx.memflags(insn).expect("memory flags");
@@ -1975,8 +2055,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 None
             };
-            // For the amode, we could do better, but for now just use `0(addr)`.
-            let rm = RegMem::mem(Amode::imm_reg(0, addr));
+
+            let rm = RegMem::mem(addr);
             if ty_access == types::I64 {
                 ctx.emit(Inst::mov64_rm_r(rm, data, srcloc));
             } else {
@@ -1993,7 +2073,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::AtomicStore => {
             // This is a normal store, followed by an `mfence` instruction.
             let data = input_to_reg(ctx, inputs[0]);
-            let addr = input_to_reg(ctx, inputs[1]);
+            let addr = lower_to_amode(ctx, inputs[1], 0);
             let ty_access = ctx.input_ty(insn, 0);
             assert!(is_valid_atomic_transaction_ty(ty_access));
             let memflags = ctx.memflags(insn).expect("memory flags");
@@ -2002,13 +2082,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             } else {
                 None
             };
-            // For the amode, we could do better, but for now just use `0(addr)`.
-            ctx.emit(Inst::mov_r_m(
-                ty_access.bytes() as u8,
-                data,
-                Amode::imm_reg(0, addr),
-                srcloc,
-            ));
+
+            ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr, srcloc));
             ctx.emit(Inst::Fence {
                 kind: FenceKind::MFence,
             });
@@ -2068,81 +2143,36 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
                 let cond_code = inst_fp_condcode(ctx.data(fcmp));
 
-                // See comments in the lowering of Fcmp.
-                let (cond_code, swap_op, was_equal) = match cond_code {
-                    FloatCC::LessThan
-                    | FloatCC::LessThanOrEqual
-                    | FloatCC::UnorderedOrGreaterThan
-                    | FloatCC::UnorderedOrGreaterThanOrEqual => {
-                        (cond_code.reverse(), FcmpOperands::Swap, false)
-                    }
-                    FloatCC::Equal => {
-                        // Additionally, we invert Equal to NotEqual too: taking LHS if equal would
-                        // mean take it if both CC::NP and CC::Z are set, the conjunction of which
-                        // can't be modeled with a single cmov instruction. Instead, we'll swap LHS
-                        // and RHS in the select operation, and invert the equal to a not-equal
-                        // here.
-                        (FloatCC::NotEqual, FcmpOperands::DontSwap, true)
-                    }
-                    _ => (cond_code, FcmpOperands::DontSwap, false),
-                };
-                emit_fcmp(ctx, fcmp, swap_op);
+                // We request inversion of Equal to NotEqual here: taking LHS if equal would mean
+                // take it if both CC::NP and CC::Z are set, the conjunction of which can't be
+                // modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
+                // select operation, and invert the equal to a not-equal here.
+                let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
 
-                let (lhs, rhs) = if was_equal {
-                    // See comment above about inverting conditional code.
-                    (
-                        input_to_reg_mem(ctx, inputs[2]),
-                        input_to_reg(ctx, inputs[1]),
-                    )
-                } else {
-                    (
-                        input_to_reg_mem(ctx, inputs[1]),
-                        input_to_reg(ctx, inputs[2]),
-                    )
+                let (lhs_input, rhs_input) = match fcmp_results {
+                    FcmpCondResult::InvertedEqualOrConditions(_, _) => (inputs[2], inputs[1]),
+                    FcmpCondResult::Condition(_)
+                    | FcmpCondResult::AndConditions(_, _)
+                    | FcmpCondResult::OrConditions(_, _) => (inputs[1], inputs[2]),
                 };
 
-                let dst = output_to_reg(ctx, outputs[0]);
-                let ty = ctx.output_ty(insn, 0);
-
-                let lhs = if is_int_ty(ty) {
-                    let size = ty.bytes() as u8;
-                    if size == 1 {
-                        // Sign-extend operands to 32, then do a cmove of size 4.
-                        let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32);
-                        ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se, None));
-                        ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst, None));
-                        RegMem::reg(lhs_se.to_reg())
-                    } else {
-                        ctx.emit(Inst::gen_move(dst, rhs, ty));
-                        lhs
-                    }
+                let rhs = input_to_reg(ctx, rhs_input);
+                let dst = output_to_reg(ctx, outputs[0]);
+                let lhs = if is_int_ty(ty) && ty.bytes() < 4 {
+                    // Special case: since the higher bits are undefined per CLIF semantics, we
+                    // can just apply a 32-bit cmove here. Force the inputs into registers,
+                    // though, to avoid out-of-bounds memory accesses from reading a spilled
+                    // narrow value with a wider load.
+                    RegMem::reg(input_to_reg(ctx, lhs_input))
                 } else {
-                    debug_assert!(ty == types::F32 || ty == types::F64);
-                    ctx.emit(Inst::gen_move(dst, rhs, ty));
-                    lhs
+                    input_to_reg_mem(ctx, lhs_input)
                 };
 
-                match cond_code {
-                    FloatCC::Equal => {
-                        // See comment above about inverting conditional code.
-                        panic!("can't happen because of above guard");
-                    }
+                ctx.emit(Inst::gen_move(dst, rhs, ty));
 
-                    FloatCC::NotEqual => {
-                        // Take lhs if not-equal, that is CC::P or CC:NZ.
-                        if is_int_ty(ty) {
-                            let size = u8::max(ty.bytes() as u8, 4);
-                            ctx.emit(Inst::cmove(size, CC::P, lhs.clone(), dst));
-                            ctx.emit(Inst::cmove(size, CC::NZ, lhs, dst));
-                        } else {
-                            ctx.emit(Inst::xmm_cmove(ty == types::F64, CC::P, lhs.clone(), dst));
-                            ctx.emit(Inst::xmm_cmove(ty == types::F64, CC::NZ, lhs, dst));
-                        }
-                    }
-
-                    _ => {
-                        let cc = CC::from_floatcc(cond_code);
+                match fcmp_results {
+                    FcmpCondResult::Condition(cc) => {
                         if is_int_ty(ty) {
                             let size = u8::max(ty.bytes() as u8, 4);
                             ctx.emit(Inst::cmove(size, cc, lhs, dst));
@@ -2150,6 +2180,22 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                             ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
                         }
                     }
+                    FcmpCondResult::AndConditions(_, _) => {
+                        unreachable!(
+                            "can't AND with select; see above comment about inverting equal"
+                        );
+                    }
+                    FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
+                    | FcmpCondResult::OrConditions(cc1, cc2) => {
+                        if is_int_ty(ty) {
+                            let size = u8::max(ty.bytes() as u8, 4);
+                            ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst));
+                            ctx.emit(Inst::cmove(size, cc2, lhs, dst));
+                        } else {
+                            ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst));
+                            ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst));
+                        }
+                    }
                 }
             } else {
                 let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
@@ -2164,27 +2210,27 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     CC::NZ
                 };
 
-                let lhs = input_to_reg_mem(ctx, inputs[1]);
                 let rhs = input_to_reg(ctx, inputs[2]);
                 let dst = output_to_reg(ctx, outputs[0]);
-                let ty = ctx.output_ty(insn, 0);
+
+                ctx.emit(Inst::gen_move(dst, rhs, ty));
+
                 if is_int_ty(ty) {
-                    let size = ty.bytes() as u8;
-                    if size == 1 {
-                        // Sign-extend operands to 32, then do a cmove of size 4.
-                        let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32);
-                        ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se, None));
-                        ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst, None));
-                        ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst));
+                    let mut size = ty.bytes() as u8;
+                    let lhs = if size < 4 {
+                        // Special case: since the higher bits are undefined per CLIF semantics, we
+                        // can just apply a 32-bit cmove here. Force the inputs into registers,
+                        // though, to avoid out-of-bounds memory accesses from reading a spilled
+                        // narrow value with a wider load.
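+                        // (E.g. an i8-typed select uses a 4-byte cmove; if the operand lived in
+                        // a 1-byte spill slot, the wider load could read past the slot.)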
+                        size = 4;
+                        RegMem::reg(input_to_reg(ctx, inputs[1]))
                     } else {
-                        ctx.emit(Inst::gen_move(dst, rhs, ty));
-                        ctx.emit(Inst::cmove(size, cc, lhs, dst));
-                    }
+                        input_to_reg_mem(ctx, inputs[1])
+                    };
+                    ctx.emit(Inst::cmove(size, cc, lhs, dst));
                 } else {
                     debug_assert!(ty == types::F32 || ty == types::F64);
-                    ctx.emit(Inst::gen_move(dst, rhs, ty));
+                    let lhs = input_to_reg_mem(ctx, inputs[1]);
                     ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
                 }
             }
@@ -2464,47 +2510,29 @@ impl LowerBackend for X64Backend {
                 } else {
                     cond_code
                 };
+
                 let cc = CC::from_intcc(cond_code);
                 ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
             } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
                 let cond_code = inst_fp_condcode(ctx.data(fcmp));
-
                 let cond_code = if op0 == Opcode::Brz {
                     cond_code.inverse()
                 } else {
                     cond_code
                 };
-
-                // See comments in the lowering of Fcmp.
-                let (cond_code, swap_op) = match cond_code {
-                    FloatCC::LessThan
-                    | FloatCC::LessThanOrEqual
-                    | FloatCC::UnorderedOrGreaterThan
-                    | FloatCC::UnorderedOrGreaterThanOrEqual => {
-                        (cond_code.reverse(), FcmpOperands::Swap)
-                    }
-                    _ => (cond_code, FcmpOperands::DontSwap),
-                };
-                emit_fcmp(ctx, fcmp, swap_op);
-
-                match cond_code {
-                    FloatCC::Equal => {
-                        // Jump to taken if CC::NP and CC::Z, that is, jump to not-taken if
-                        // CC::P or CC::NZ.
-                        ctx.emit(Inst::jmp_if(CC::P, not_taken));
-                        ctx.emit(Inst::jmp_cond(CC::NZ, not_taken, taken));
-                    }
-
-                    FloatCC::NotEqual => {
-                        // Jump to taken if CC::P or CC::NZ.
-                        ctx.emit(Inst::jmp_if(CC::P, taken));
-                        ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken));
-                    }
-
-                    _ => {
-                        let cc = CC::from_floatcc(cond_code);
+                match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
+                    FcmpCondResult::Condition(cc) => {
                         ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
                     }
+                    FcmpCondResult::AndConditions(cc1, cc2) => {
+                        ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
+                        ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
+                    }
+                    FcmpCondResult::OrConditions(cc1, cc2) => {
+                        ctx.emit(Inst::jmp_if(cc1, taken));
+                        ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
+                    }
+                    FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
                 }
             } else if is_int_ty(src_ty) || is_bool_ty(src_ty) {
                 let src = input_to_reg(
diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs
index b421f79254..9ec313916e 100644
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -4,7 +4,7 @@
 
 use crate::entity::SecondaryMap;
 use crate::fx::{FxHashMap, FxHashSet};
-use crate::inst_predicates::{has_side_effect_or_load_not_get_pinned_reg, is_constant_64bit};
+use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
 use crate::ir::instructions::BranchInfo;
 use crate::ir::types::I64;
 use crate::ir::{
@@ -372,7 +372,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
         for bb in f.layout.blocks() {
             cur_color += 1;
             for inst in f.layout.block_insts(bb) {
-                let side_effect = has_side_effect_or_load_not_get_pinned_reg(f, inst);
+                let side_effect = has_lowering_side_effect(f, inst);
 
                 // Assign colors. A new color is chosen *after* any side-effecting instruction.
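+                // (Two instructions end up with the same color exactly when no side-effecting
+                // instruction separates them; equal colors are what later allows an effectful
+                // instruction, such as a load, to be merged into its single use.)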
                 inst_colors[inst] = InstColor::new(cur_color);
@@ -799,15 +799,15 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
             ValueDef::Result(src_inst, result_idx) => {
                 debug!(" -> src inst {}", src_inst);
                 debug!(
-                    " -> has side effect: {}",
-                    has_side_effect_or_load_not_get_pinned_reg(self.f, src_inst)
+                    " -> has lowering side effect: {}",
+                    has_lowering_side_effect(self.f, src_inst)
                 );
                 debug!(
                     " -> our color is {:?}, src inst is {:?}",
                     self.inst_color(at_inst),
                     self.inst_color(src_inst)
                 );
-                if !has_side_effect_or_load_not_get_pinned_reg(self.f, src_inst)
+                if !has_lowering_side_effect(self.f, src_inst)
                     || self.inst_color(at_inst) == self.inst_color(src_inst)
                 {
                     Some((src_inst, result_idx))
@@ -989,6 +989,8 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
     fn use_input_reg(&mut self, input: LowerInput) {
         debug!("use_input_reg: vreg {:?} is needed", input.reg);
+        // The input may be a real (machine) register rather than a virtual one, when we know that
+        // register holds the result of an opcode (e.g. GetPinnedReg).
         if input.reg.is_virtual() {
             self.vreg_needed[input.reg.get_index()] = true;
         }
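
A compact way to see the coloring rule above in action (hypothetical sketch, not the code in this patch; the real implementation tracks `InstColor` values in a `SecondaryMap`):

    /// Assigns a color to each instruction; a new color starts *after* every
    /// side-effecting instruction, mirroring the loop above.
    fn assign_colors(has_side_effect: &[bool]) -> Vec<u32> {
        let mut color = 0u32;
        has_side_effect
            .iter()
            .map(|&effectful| {
                let c = color;
                if effectful {
                    color += 1;
                }
                c
            })
            .collect()
    }

    fn main() {
        // E.g. load, add, store, load, use:
        let colors = assign_colors(&[true, false, true, true, false]);
        assert_eq!(colors, vec![0, 1, 1, 2, 3]);
        // The first load (color 0) and the final use (color 3) differ, so the
        // load may not be sunk to its use; per the check above, merging is only
        // allowed when the colors match, i.e. when no side effect sits between.
    }
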