diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 5d4c409fda..ec58615773 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1168,7 +1168,11 @@ impl Inst {
     ) -> Inst {
         let rc = from_reg.get_class();
         match rc {
-            RegClass::I64 => Inst::mov_r_m(ty.bytes() as u8, from_reg, to_addr, srcloc),
+            RegClass::I64 => {
+                // Always store the full register, to ensure that the high bits are properly set
+                // when doing a full reload.
+                Inst::mov_r_m(8 /* bytes */, from_reg, to_addr, srcloc)
+            }
             RegClass::V128 => {
                 let opcode = match ty {
                     types::F32 => SseOpcode::Movss,
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 1da69c4d30..9ae0149565 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -70,6 +70,25 @@ fn matches_input<C: LowerCtx<I = Inst>>(
     })
 }
 
+/// Returns whether the given specified `input` is a result produced by an instruction with any of
+/// the opcodes specified in `ops`.
+fn matches_input_any<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    input: InsnInput,
+    ops: &[Opcode],
+) -> Option<IRInst> {
+    let inputs = ctx.get_input(input.insn, input.input);
+    inputs.inst.and_then(|(src_inst, _)| {
+        let data = ctx.data(src_inst);
+        for &op in ops {
+            if data.opcode() == op {
+                return Some(src_inst);
+            }
+        }
+        None
+    })
+}
+
 fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg {
     ctx.use_input_reg(input);
     input.reg
@@ -1339,29 +1358,55 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let src_ty = ctx.input_ty(insn, 0);
             let dst_ty = ctx.output_ty(insn, 0);
 
+            // Sextend requires a sign-extended move, but all the other opcodes are simply a move
+            // from a zero-extended source. Here is why this works, in each case:
+            //
+            // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to
+            // zero-extend here.
+            //
+            // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so
+            // again, this is a zero-extend / no-op.
+            //
+            // - Ireduce: changing width of an integer. Smaller ints are stored with undefined
+            // high-order bits, so we can simply do a copy.
+
+            if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend {
+                // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on
+                // 32-bits will zero-extend the upper 32-bits, so we can even not generate a
+                // zero-extended move in this case.
+                // TODO add loads and shifts here.
+                if let Some(_) = matches_input_any(
+                    ctx,
+                    inputs[0],
+                    &[
+                        Opcode::Iadd,
+                        Opcode::IaddIfcout,
+                        Opcode::Isub,
+                        Opcode::Imul,
+                        Opcode::Band,
+                        Opcode::Bor,
+                        Opcode::Bxor,
+                    ],
+                ) {
+                    let src = put_input_in_reg(ctx, inputs[0]);
+                    let dst = get_output_reg(ctx, outputs[0]);
+                    ctx.emit(Inst::gen_move(dst, src, types::I64));
+                    return Ok(());
+                }
+            }
+
             let src = input_to_reg_mem(ctx, inputs[0]);
             let dst = get_output_reg(ctx, outputs[0]);
 
             let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits());
-            assert!(
-                (src_ty.bits() < dst_ty.bits() && ext_mode.is_some()) || ext_mode.is_none(),
+            assert_eq!(
+                src_ty.bits() < dst_ty.bits(),
+                ext_mode.is_some(),
                 "unexpected extension: {} -> {}",
                 src_ty,
                 dst_ty
            );
 
-            // All of these other opcodes are simply a move from a zero-extended source. Here
-            // is why this works, in each case:
-            //
-            // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we
-            // merely need to zero-extend here.
-            //
-            // - Breduce, Bextend: changing width of a boolean. We represent a
-            // bool as a 0 or 1, so again, this is a zero-extend / no-op.
-            //
-            // - Ireduce: changing width of an integer. Smaller ints are stored
-            // with undefined high-order bits, so we can simply do a copy.
-
             if let Some(ext_mode) = ext_mode {
                 if op == Opcode::Sextend {
                     ctx.emit(Inst::movsx_rm_r(
diff --git a/cranelift/filetests/filetests/isa/x64/uextend-elision.clif b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif
new file mode 100644
index 0000000000..aed6068d42
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/uextend-elision.clif
@@ -0,0 +1,17 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+function %elide_uextend_add(i32, i32) -> i64 {
+block0(v0: i32, v1: i32):
+; check: pushq %rbp
+; check: movq %rsp, %rbp
+  v2 = iadd v0, v1
+; check: addl %esi, %edi
+  v3 = uextend.i64 v2
+; check: movq %rdi, %rax
+; check: movq %rbp, %rsp
+; check: popq %rbp
+; check: ret
+  return v3
+}