diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 74dca6c3ec..aa757392e3 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -138,42 +138,62 @@ impl ABIMachineSpec for X64ABIMachineSpec { ), } - let intreg = in_int_reg(param.value_type); - let vecreg = in_vec_reg(param.value_type); - debug_assert!(intreg || vecreg); - debug_assert!(!(intreg && vecreg)); - - let (next_reg, candidate) = if intreg { - let candidate = match args_or_rets { - ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr), - ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i), - }; - debug_assert!(candidate - .map(|r| r.get_class() == RegClass::I64) - .unwrap_or(true)); - (&mut next_gpr, candidate) - } else { - let candidate = match args_or_rets { - ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg), - ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i), - }; - debug_assert!(candidate - .map(|r| r.get_class() == RegClass::V128) - .unwrap_or(true)); - (&mut next_vreg, candidate) - }; - if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { - assert!(intreg); ret.push(param); - } else if let Some(reg) = candidate { + continue; + } + + // Find regclass(es) of the register(s) used to store a value of this type. + let (rcs, _) = Inst::rc_for_type(param.value_type)?; + let intreg = rcs[0] == RegClass::I64; + let num_regs = rcs.len(); + assert!(num_regs <= 2); + if num_regs == 2 { + assert_eq!(rcs[0], rcs[1]); + } + + let mut regs: SmallVec<[RealReg; 2]> = smallvec![]; + for j in 0..num_regs { + let nextreg = if intreg { + match args_or_rets { + ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr + j), + ArgsOrRets::Rets => { + get_intreg_for_retval_systemv(&call_conv, next_gpr + j, i + j) + } + } + } else { + match args_or_rets { + ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg + j), + ArgsOrRets::Rets => { + get_fltreg_for_retval_systemv(&call_conv, next_vreg + j, i + j) + } + } + }; + if let Some(reg) = nextreg { + regs.push(reg.to_real_reg()); + } else { + regs.clear(); + break; + } + } + + if regs.len() > 0 { + let regs = match num_regs { + 1 => ValueRegs::one(regs[0]), + 2 => ValueRegs::two(regs[0], regs[1]), + _ => panic!("More than two registers unexpected"), + }; ret.push(ABIArg::Reg( - ValueRegs::one(reg.to_real_reg()), + regs, param.value_type, param.extension, param.purpose, )); - *next_reg += 1; + if intreg { + next_gpr += num_regs; + } else { + next_vreg += num_regs; + } } else { // Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte // stack alignment happens separately after all args.) 
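// Reviewer sketch (not part of the patch): a standalone model of the register
// assignment policy the hunk above implements for the SysV calling convention.
// An i128 argument needs two consecutive GPRs; if not enough registers remain,
// the whole value falls back to the stack (that is what `regs.clear()` above
// achieves). Names here (`Slot`, `assign_i128`) are hypothetical and only
// illustrate the policy, not the cranelift API.
#[derive(Debug, PartialEq)]
enum Slot {
    Regs(&'static str, &'static str),
    Stack { offset: u32 },
}

const SYSV_INT_ARG_REGS: [&str; 6] = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"];

fn assign_i128(next_gpr: &mut usize, next_stack: &mut u32) -> Slot {
    if *next_gpr + 2 <= SYSV_INT_ARG_REGS.len() {
        let lo = SYSV_INT_ARG_REGS[*next_gpr];
        let hi = SYSV_INT_ARG_REGS[*next_gpr + 1];
        *next_gpr += 2; // both halves consume GPRs, mirroring `next_gpr += num_regs`
        Slot::Regs(lo, hi)
    } else {
        // Like the code above: if either half fails to get a register, the
        // partially collected registers are dropped and the value goes to stack.
        let offset = *next_stack;
        *next_stack += 16; // an i128 occupies two 8-byte stack slots (sketch assumption)
        Slot::Stack { offset }
    }
}

fn main() {
    let (mut gpr, mut stack) = (5, 0); // only %r9 left: not enough for both halves
    assert_eq!(assign_i128(&mut gpr, &mut stack), Slot::Stack { offset: 0 });
    let (mut gpr, mut stack) = (0, 0);
    assert_eq!(assign_i128(&mut gpr, &mut stack), Slot::Regs("rdi", "rsi"));
}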
@@ -658,31 +678,6 @@ impl From for SyntheticAmode { } } -fn in_int_reg(ty: types::Type) -> bool { - match ty { - types::I8 - | types::I16 - | types::I32 - | types::I64 - | types::B1 - | types::B8 - | types::B16 - | types::B32 - | types::B64 - | types::R64 => true, - types::R32 => panic!("unexpected 32-bits refs on x64!"), - _ => false, - } -} - -fn in_vec_reg(ty: types::Type) -> bool { - match ty { - types::F32 | types::F64 => true, - _ if ty.is_vector() => true, - _ => false, - } -} - fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option { match call_conv { CallConv::Fast diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 898134644f..39ca25d060 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -346,23 +346,35 @@ impl PrettyPrintSized for RegMem { #[derive(Copy, Clone, PartialEq)] pub enum AluRmiROpcode { Add, + Adc, Sub, + Sbb, And, Or, Xor, /// The signless, non-extending (N x N -> N, for N in {32,64}) variant. Mul, + /// 8-bit form of And. Handled separately as we don't have full 8-bit op + /// support (we just use wider instructions). Used only with some sequences + /// with SETcc. + And8, + /// 8-bit form of Or. + Or8, } impl fmt::Debug for AluRmiROpcode { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let name = match self { AluRmiROpcode::Add => "add", + AluRmiROpcode::Adc => "adc", AluRmiROpcode::Sub => "sub", + AluRmiROpcode::Sbb => "sbb", AluRmiROpcode::And => "and", AluRmiROpcode::Or => "or", AluRmiROpcode::Xor => "xor", AluRmiROpcode::Mul => "imul", + AluRmiROpcode::And8 => "and", + AluRmiROpcode::Or8 => "or", }; write!(fmt, "{}", name) } @@ -374,6 +386,16 @@ impl fmt::Display for AluRmiROpcode { } } +impl AluRmiROpcode { + /// Is this a special-cased 8-bit ALU op? + pub fn is_8bit(self) -> bool { + match self { + AluRmiROpcode::And8 | AluRmiROpcode::Or8 => true, + _ => false, + } + } +} + #[derive(Clone, PartialEq)] pub enum UnaryRmROpcode { /// Bit-scan reverse. @@ -1010,7 +1032,7 @@ impl fmt::Display for ExtMode { } /// These indicate the form of a scalar shift/rotate: left, signed right, unsigned right. -#[derive(Clone)] +#[derive(Clone, Copy)] pub enum ShiftKind { ShiftLeft, /// Inserts zeros in the most significant bits. 
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 580d469b8d..075724d493 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -83,6 +83,14 @@ impl RexFlags { self } + #[inline(always)] + fn always_emit_if_8bit_needed(&mut self, reg: u8) -> &mut Self { + if reg >= 4 && reg <= 7 { + self.always_emit(); + } + self + } + #[inline(always)] fn must_clear_w(&self) -> bool { (self.0 & 1) != 0 @@ -527,7 +535,7 @@ pub(crate) fn emit( src, dst: reg_g, } => { - let rex = if *is_64 { + let mut rex = if *is_64 { RexFlags::set_w() } else { RexFlags::clear_w() @@ -581,17 +589,26 @@ pub(crate) fn emit( } } } else { - let (opcode_r, opcode_m, subopcode_i) = match op { - AluRmiROpcode::Add => (0x01, 0x03, 0), - AluRmiROpcode::Sub => (0x29, 0x2B, 5), - AluRmiROpcode::And => (0x21, 0x23, 4), - AluRmiROpcode::Or => (0x09, 0x0B, 1), - AluRmiROpcode::Xor => (0x31, 0x33, 6), + let (opcode_r, opcode_m, subopcode_i, is_8bit) = match op { + AluRmiROpcode::Add => (0x01, 0x03, 0, false), + AluRmiROpcode::Adc => (0x11, 0x03, 0, false), + AluRmiROpcode::Sub => (0x29, 0x2B, 5, false), + AluRmiROpcode::Sbb => (0x19, 0x2B, 5, false), + AluRmiROpcode::And => (0x21, 0x23, 4, false), + AluRmiROpcode::Or => (0x09, 0x0B, 1, false), + AluRmiROpcode::Xor => (0x31, 0x33, 6, false), + AluRmiROpcode::And8 => (0x20, 0x22, 4, true), + AluRmiROpcode::Or8 => (0x08, 0x0A, 1, true), AluRmiROpcode::Mul => panic!("unreachable"), }; + assert!(!(is_8bit && *is_64)); match src { RegMemImm::Reg { reg: reg_e } => { + if is_8bit { + rex.always_emit_if_8bit_needed(int_reg_enc(*reg_e)); + rex.always_emit_if_8bit_needed(int_reg_enc(reg_g.to_reg())); + } // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R // duality). Do this too, so as to be able to compare generated machine // code easily. @@ -604,11 +621,12 @@ pub(crate) fn emit( reg_g.to_reg(), rex, ); - // NB: if this is ever extended to handle byte size ops, be sure to retain - // redundant REX prefixes. } RegMemImm::Mem { addr } => { + if is_8bit { + rex.always_emit_if_8bit_needed(int_reg_enc(reg_g.to_reg())); + } // Here we revert to the "normal" G-E ordering. let amode = addr.finalize(state, sink); emit_std_reg_mem( @@ -625,6 +643,7 @@ pub(crate) fn emit( } RegMemImm::Imm { simm32 } => { + assert!(!is_8bit); let use_imm8 = low8_will_sign_extend_to_32(*simm32); let opcode = if use_imm8 { 0x83 } else { 0x81 }; // And also here we use the "normal" G-E ordering. 
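// Reviewer sketch (not part of the patch): the opcode table above wires up
// adc (0x11) and sbb (0x19) so that 128-bit add/sub can be split into a low-half
// op that sets the carry flag and a high-half op that consumes it. This is what
// the emitted add/adc and sub/sbb pairs compute, in plain Rust; `add128`/`sub128`
// are illustrative names, not cranelift helpers.
fn add128(lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
    let (lo, carry) = lhs.0.overflowing_add(rhs.0); // addq: sets CF on unsigned overflow
    let hi = lhs.1.wrapping_add(rhs.1).wrapping_add(carry as u64); // adcq: adds CF back in
    (lo, hi)
}

fn sub128(lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
    let (lo, borrow) = lhs.0.overflowing_sub(rhs.0); // subq: sets CF on borrow
    let hi = lhs.1.wrapping_sub(rhs.1).wrapping_sub(borrow as u64); // sbbq: subtracts CF
    (lo, hi)
}

fn main() {
    // 0x1_ffff_ffff_ffff_ffff + 1 = 0x2_0000_0000_0000_0000: carry propagates.
    assert_eq!(add128((u64::MAX, 1), (1, 0)), (0, 2));
    assert_eq!(sub128((0, 2), (1, 0)), (u64::MAX, 1));
}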
@@ -685,8 +704,13 @@ pub(crate) fn emit( } Inst::Not { size, src } => { + let src = int_reg_enc(src.to_reg()); let (opcode, prefix, rex_flags) = match size { - 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xF6, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(src), + ), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), @@ -694,13 +718,17 @@ pub(crate) fn emit( }; let subopcode = 2; - let src = int_reg_enc(src.to_reg()); emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } Inst::Neg { size, src } => { + let src = int_reg_enc(src.to_reg()); let (opcode, prefix, rex_flags) = match size { - 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xF6, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(src), + ), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xF7, LegacyPrefixes::None, RexFlags::set_w()), @@ -708,7 +736,6 @@ pub(crate) fn emit( }; let subopcode = 3; - let src = int_reg_enc(src.to_reg()); emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } @@ -717,7 +744,7 @@ pub(crate) fn emit( signed, divisor, } => { - let (opcode, prefix, rex_flags) = match size { + let (opcode, prefix, mut rex_flags) = match size { 1 => (0xF6, LegacyPrefixes::None, RexFlags::clear_w()), 2 => (0xF7, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xF7, LegacyPrefixes::None, RexFlags::clear_w()), @@ -732,6 +759,9 @@ pub(crate) fn emit( match divisor { RegMem::Reg { reg } => { let src = int_reg_enc(*reg); + if *size == 1 { + rex_flags.always_emit_if_8bit_needed(src); + } emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, src, rex_flags) } RegMem::Mem { addr: src } => { @@ -987,9 +1017,7 @@ pub(crate) fn emit( ExtMode::BL | ExtMode::BQ => { // A redundant REX prefix must be emitted for certain register inputs. let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); } _ => {} } @@ -1084,9 +1112,7 @@ pub(crate) fn emit( ExtMode::BL | ExtMode::BQ => { // A redundant REX prefix must be emitted for certain register inputs. 
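// Reviewer sketch (not part of the patch): why `always_emit_if_8bit_needed`
// forces a REX byte for encodings 4..=7. Without any REX prefix, ModRM register
// fields 4..7 in a byte instruction select the legacy high-byte registers
// AH/CH/DH/BH; with a REX prefix present (even an otherwise empty 0x40), the
// same fields select SPL/BPL/SIL/DIL instead. Encodings 0..=3 (AL/CL/DL/BL) are
// unaffected, and 8..=15 (R8B..R15B) already require REX.B/REX.R anyway, so the
// prefix is only *forced* for 4..=7.
fn needs_forced_rex_for_8bit(hw_enc: u8) -> bool {
    (4..=7).contains(&hw_enc)
}

fn main() {
    assert!(!needs_forced_rex_for_8bit(0)); // %al: "notb %al" encodes as F6 D0
    assert!(needs_forced_rex_for_8bit(7));  // %dil: "notb %dil" needs 40 F6 D7
    assert!(!needs_forced_rex_for_8bit(15)); // %r15b carries REX.B/REX.R regardless
}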
let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); } _ => {} } @@ -1130,9 +1156,7 @@ pub(crate) fn emit( let mut rex = RexFlags::clear_w(); let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex.always_emit(); - }; + rex.always_emit_if_8bit_needed(enc_src); // MOV r8, r/m8 is (REX.W==0) 88 /r emit_std_reg_mem( @@ -1215,7 +1239,11 @@ pub(crate) fn emit( match num_bits { None => { let (opcode, prefix, rex_flags) = match size { - 1 => (0xD2, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xD2, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(enc_dst), + ), 2 => (0xD3, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xD3, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xD3, LegacyPrefixes::None, RexFlags::set_w()), @@ -1231,7 +1259,11 @@ pub(crate) fn emit( Some(num_bits) => { let (opcode, prefix, rex_flags) = match size { - 1 => (0xC0, LegacyPrefixes::None, RexFlags::clear_w()), + 1 => ( + 0xC0, + LegacyPrefixes::None, + *RexFlags::clear_w().always_emit_if_8bit_needed(enc_dst), + ), 2 => (0xC1, LegacyPrefixes::_66, RexFlags::clear_w()), 4 => (0xC1, LegacyPrefixes::None, RexFlags::clear_w()), 8 => (0xC1, LegacyPrefixes::None, RexFlags::set_w()), @@ -1330,9 +1362,7 @@ pub(crate) fn emit( let mut rex = RexFlags::clear_w(); // Here, a redundant REX prefix changes the meaning of the instruction. let enc_g = int_reg_enc(*reg_g); - if enc_g >= 4 && enc_g <= 7 { - rex.always_emit(); - } + rex.always_emit_if_8bit_needed(enc_g); rex } _ => panic!("x64::Inst::Cmp_RMI_R::emit: unreachable"), @@ -1343,9 +1373,7 @@ pub(crate) fn emit( if *size == 1 { // Check whether the E register forces the use of a redundant REX. 
let enc_e = int_reg_enc(*reg_e); - if enc_e >= 4 && enc_e <= 7 { - rex.always_emit(); - } + rex.always_emit_if_8bit_needed(enc_e); } // Use the swapped operands encoding for CMP, to stay consistent with the output of @@ -2761,9 +2789,7 @@ pub(crate) fn emit( types::I8 => { let mut rex_flags = RexFlags::clear_w(); let enc_src = int_reg_enc(*src); - if enc_src >= 4 && enc_src <= 7 { - rex_flags.always_emit(); - }; + rex_flags.always_emit_if_8bit_needed(enc_src); (LegacyPrefixes::_F0, rex_flags, 0x0FB0) } types::I16 => (LegacyPrefixes::_66F0, RexFlags::clear_w(), 0x0FB1), diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index c3489089b9..42e38c9cd5 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -1025,6 +1025,56 @@ fn test_x64_emit() { "4C09FA", "orq %r15, %rdx", )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(r15), w_rdx), + "4420FA", + "andb %r15b, %dl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(rax), w_rsi), + "4020C6", + "andb %al, %sil", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::And8, RegMemImm::reg(rax), w_rbx), + "20C3", + "andb %al, %bl", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::And8, + RegMemImm::mem(Amode::imm_reg(0, rax)), + w_rbx, + ), + "2218", + "andb 0(%rax), %bl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(r15), w_rdx), + "4408FA", + "orb %r15b, %dl", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(rax), w_rsi), + "4008C6", + "orb %al, %sil", + )); + insns.push(( + Inst::alu_rmi_r(false, AluRmiROpcode::Or8, RegMemImm::reg(rax), w_rbx), + "08C3", + "orb %al, %bl", + )); + insns.push(( + Inst::alu_rmi_r( + false, + AluRmiROpcode::Or8, + RegMemImm::mem(Amode::imm_reg(0, rax)), + w_rbx, + ), + "0A18", + "orb 0(%rax), %bl", + )); insns.push(( Inst::alu_rmi_r(true, AluRmiROpcode::Xor, RegMemImm::reg(r15), w_rdx), "4C31FA", @@ -1193,6 +1243,16 @@ fn test_x64_emit() { "66F7D7", "notw %di", )); + insns.push(( + Inst::not(1, Writable::from_reg(regs::rdi())), + "40F6D7", + "notb %dil", + )); + insns.push(( + Inst::not(1, Writable::from_reg(regs::rax())), + "F6D0", + "notb %al", + )); // ======================================================== // Neg @@ -1216,6 +1276,16 @@ fn test_x64_emit() { "66F7DF", "negw %di", )); + insns.push(( + Inst::neg(1, Writable::from_reg(regs::rdi())), + "40F6DF", + "negb %dil", + )); + insns.push(( + Inst::neg(1, Writable::from_reg(regs::rax())), + "F6D8", + "negb %al", + )); // ======================================================== // Div @@ -1239,6 +1309,16 @@ fn test_x64_emit() { "48F7F7", "div %rdi", )); + insns.push(( + Inst::div(1, false, RegMem::reg(regs::rax())), + "F6F0", + "div %al", + )); + insns.push(( + Inst::div(1, false, RegMem::reg(regs::rsi())), + "40F6F6", + "div %sil", + )); // ======================================================== // MulHi @@ -2352,9 +2432,14 @@ fn test_x64_emit() { )); insns.push(( Inst::shift_r(1, ShiftKind::RotateRight, None, w_rsi), - "D2CE", + "40D2CE", "rorb %cl, %sil", )); + insns.push(( + Inst::shift_r(1, ShiftKind::RotateRight, None, w_rax), + "D2C8", + "rorb %cl, %al", + )); insns.push(( Inst::shift_r(1, ShiftKind::RotateRight, Some(5), w_r15), "41C0CF05", diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 09c469498d..979c264231 100644 
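// Reviewer sketch (not part of the patch): hand-assembling the first new test
// case above, `andb %r15b, %dl` => "4420FA", to make the expected byte strings
// easier to audit. This mirrors only the byte layout; it is not the emitter.
fn encode_and8_reg_reg(src_enc: u8, dst_enc: u8) -> Vec<u8> {
    let mut out = Vec::new();
    // REX is 0100WRXB. W=0 for a byte op; R extends ModRM.reg (the source, since
    // opcode 0x20 is AND r/m8, r8); B extends ModRM.rm (the destination).
    let rex: u8 = 0x40 | (((src_enc >> 3) & 1) << 2) | ((dst_enc >> 3) & 1);
    let forced = (4..=7).contains(&src_enc) || (4..=7).contains(&dst_enc);
    if rex != 0x40 || forced {
        out.push(rex);
    }
    out.push(0x20); // AND r/m8, r8
    out.push(0xC0 | ((src_enc & 7) << 3) | (dst_enc & 7)); // ModRM with mod=0b11
    out
}

fn main() {
    // %r15b is encoding 15, %dl is encoding 2: REX.R set => "4420FA".
    assert_eq!(encode_and8_reg_reg(15, 2), vec![0x44, 0x20, 0xFA]);
    // %al (0) into %bl (3): no REX needed => "20C3".
    assert_eq!(encode_and8_reg_reg(0, 3), vec![0x20, 0xC3]);
    // %al (0) into %sil (6): the destination forces a bare 0x40 REX => "4020C6".
    assert_eq!(encode_and8_reg_reg(0, 6), vec![0x40, 0x20, 0xC6]);
}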
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1243,6 +1243,14 @@ impl PrettyPrint for Inst { (if is_64 { "q" } else { "l" }).to_string() } + fn suffix_lqb(is_64: bool, is_8: bool) -> String { + match (is_64, is_8) { + (_, true) => "b".to_string(), + (true, false) => "q".to_string(), + (false, false) => "l".to_string(), + } + } + fn size_lq(is_64: bool) -> u8 { if is_64 { 8 @@ -1251,6 +1259,16 @@ impl PrettyPrint for Inst { } } + fn size_lqb(is_64: bool, is_8: bool) -> u8 { + if is_8 { + 1 + } else if is_64 { + 8 + } else { + 4 + } + } + fn suffix_bwlq(size: u8) -> String { match size { 1 => "b".to_string(), @@ -1271,9 +1289,9 @@ impl PrettyPrint for Inst { dst, } => format!( "{} {}, {}", - ljustify2(op.to_string(), suffix_lq(*is_64)), - src.show_rru_sized(mb_rru, size_lq(*is_64)), - show_ireg_sized(dst.to_reg(), mb_rru, size_lq(*is_64)), + ljustify2(op.to_string(), suffix_lqb(*is_64, op.is_8bit())), + src.show_rru_sized(mb_rru, size_lqb(*is_64, op.is_8bit())), + show_ireg_sized(dst.to_reg(), mb_rru, size_lqb(*is_64, op.is_8bit())), ), Inst::UnaryRmR { src, dst, op, size } => format!( @@ -2065,6 +2083,17 @@ impl Amode { } } } + + /// Offset the amode by a fixed offset. + pub(crate) fn offset(&self, offset: u32) -> Self { + let mut ret = self.clone(); + match &mut ret { + &mut Amode::ImmReg { ref mut simm32, .. } => *simm32 += offset, + &mut Amode::ImmRegRegShift { ref mut simm32, .. } => *simm32 += offset, + _ => panic!("Cannot offset amode: {:?}", self), + } + ret + } } impl RegMemImm { @@ -2548,77 +2577,88 @@ impl MachInst for Inst { ty: Type, mut alloc_tmp: F, ) -> SmallVec<[Self; 4]> { - // We don't support 128-bit constants. - assert!(value <= u64::MAX as u128); let mut ret = SmallVec::new(); - let to_reg = to_regs - .only_reg() - .expect("multi-reg values not supported on x64"); - if ty == types::F32 { - if value == 0 { - ret.push(Inst::xmm_rm_r( - SseOpcode::Xorps, - RegMem::reg(to_reg.to_reg()), - to_reg, - )); - } else { - let tmp = alloc_tmp(types::I32); - ret.push(Inst::imm(OperandSize::Size32, value as u64, tmp)); - - ret.push(Inst::gpr_to_xmm( - SseOpcode::Movd, - RegMem::reg(tmp.to_reg()), - OperandSize::Size32, - to_reg, - )); - } - } else if ty == types::F64 { - if value == 0 { - ret.push(Inst::xmm_rm_r( - SseOpcode::Xorpd, - RegMem::reg(to_reg.to_reg()), - to_reg, - )); - } else { - let tmp = alloc_tmp(types::I64); - ret.push(Inst::imm(OperandSize::Size64, value as u64, tmp)); - - ret.push(Inst::gpr_to_xmm( - SseOpcode::Movq, - RegMem::reg(tmp.to_reg()), - OperandSize::Size64, - to_reg, - )); - } + if ty == types::I128 { + ret.push(Inst::imm( + OperandSize::Size64, + value as u64, + to_regs.regs()[0], + )); + ret.push(Inst::imm( + OperandSize::Size64, + (value >> 64) as u64, + to_regs.regs()[1], + )); } else { - // Must be an integer type. 
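// Reviewer sketch (not part of the patch): the I128 branch of `gen_constant`
// above loads the constant as two 64-bit immediates, the low half into
// regs()[0] and the high half into regs()[1]. The split itself is just:
fn split_u128(value: u128) -> (u64, u64) {
    let lo = value as u64;         // goes into the first (low) register
    let hi = (value >> 64) as u64; // goes into the second (high) register
    (lo, hi)
}

fn main() {
    let (lo, hi) = split_u128(0x0123_4567_89ab_cdef_fedc_ba98_7654_3210);
    assert_eq!((lo, hi), (0xfedc_ba98_7654_3210, 0x0123_4567_89ab_cdef));
}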
- debug_assert!( - ty == types::B1 - || ty == types::I8 - || ty == types::B8 - || ty == types::I16 - || ty == types::B16 - || ty == types::I32 - || ty == types::B32 - || ty == types::I64 - || ty == types::B64 - || ty == types::R32 - || ty == types::R64 - ); - if value == 0 { - ret.push(Inst::alu_rmi_r( - ty == types::I64, - AluRmiROpcode::Xor, - RegMemImm::reg(to_reg.to_reg()), - to_reg, - )); + let to_reg = to_regs + .only_reg() + .expect("multi-reg values not supported on x64"); + if ty == types::F32 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorps, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(types::I32); + ret.push(Inst::imm(OperandSize::Size32, value as u64, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movd, + RegMem::reg(tmp.to_reg()), + OperandSize::Size32, + to_reg, + )); + } + } else if ty == types::F64 { + if value == 0 { + ret.push(Inst::xmm_rm_r( + SseOpcode::Xorpd, + RegMem::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let tmp = alloc_tmp(types::I64); + ret.push(Inst::imm(OperandSize::Size64, value as u64, tmp)); + + ret.push(Inst::gpr_to_xmm( + SseOpcode::Movq, + RegMem::reg(tmp.to_reg()), + OperandSize::Size64, + to_reg, + )); + } } else { - let value = value as u64; - ret.push(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - value.into(), - to_reg, - )); + // Must be an integer type. + debug_assert!( + ty == types::B1 + || ty == types::I8 + || ty == types::B8 + || ty == types::I16 + || ty == types::B16 + || ty == types::I32 + || ty == types::B32 + || ty == types::I64 + || ty == types::B64 + || ty == types::R32 + || ty == types::R64 + ); + if value == 0 { + ret.push(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Xor, + RegMemImm::reg(to_reg.to_reg()), + to_reg, + )); + } else { + let value = value as u64; + ret.push(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + value.into(), + to_reg, + )); + } } } ret diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 9293221de5..a25da666b3 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -18,7 +18,7 @@ use alloc::vec::Vec; use cranelift_codegen_shared::condcodes::CondCode; use log::trace; use regalloc::{Reg, RegClass, Writable}; -use smallvec::SmallVec; +use smallvec::{smallvec, SmallVec}; use std::convert::TryFrom; use target_lexicon::Triple; @@ -28,6 +28,7 @@ use target_lexicon::Triple; fn is_int_or_ref_ty(ty: Type) -> bool { match ty { types::I8 | types::I16 | types::I32 | types::I64 | types::R64 => true, + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, types::R32 => panic!("shouldn't have 32-bits refs on x64"), _ => false, } @@ -107,23 +108,26 @@ fn generate_constant>(ctx: &mut C, ty: Type, c: u64) -> Va non_writable_value_regs(cst_copy) } -/// Put the given input into a register, and mark it as used (side-effect). -fn put_input_in_reg>(ctx: &mut C, spec: InsnInput) -> Reg { +/// Put the given input into possibly multiple registers, and mark it as used (side-effect). +fn put_input_in_regs>(ctx: &mut C, spec: InsnInput) -> ValueRegs { let ty = ctx.input_ty(spec.insn, spec.input); let input = ctx.get_input_as_source_or_const(spec.insn, spec.input); if let Some(c) = input.constant { // Generate constants fresh at each use to minimize long-range register pressure. 
generate_constant(ctx, ty, c) - .only_reg() - .expect("multi-reg values not supported yet") } else { ctx.put_input_in_regs(spec.insn, spec.input) - .only_reg() - .expect("multi-reg values not supported yet") } } +/// Put the given input into a register, and mark it as used (side-effect). +fn put_input_in_reg>(ctx: &mut C, spec: InsnInput) -> Reg { + put_input_in_regs(ctx, spec) + .only_reg() + .expect("Multi-register value not expected") +} + /// Determines whether a load operation (indicated by `src_insn`) can be merged /// into the current lowering point. If so, returns the address-base source (as /// an `InsnInput`) and an offset from that address from which to perform the @@ -373,25 +377,120 @@ fn emit_extract_lane>( /// /// Note: make sure that there are no instructions modifying the flags between a call to this /// function and the use of the flags! -fn emit_cmp>(ctx: &mut C, insn: IRInst) { +/// +/// Takes the condition code that will be tested, and returns +/// the condition code that should be used. This allows us to +/// synthesize comparisons out of multiple instructions for +/// special cases (e.g., 128-bit integers). +fn emit_cmp>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntCC { let ty = ctx.input_ty(insn, 0); let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }]; - // TODO Try to commute the operands (and invert the condition) if one is an immediate. - let lhs = put_input_in_reg(ctx, inputs[0]); - // We force the RHS into a register, and disallow load-op fusion, because we - // do not have a transitive guarantee that this cmp-site will be the sole - // user of the value. Consider: the icmp might be the only user of a load, - // but there may be multiple users of the icmp (e.g. select or bint - // instructions) that each invoke `emit_cmp()`. If we were to allow a load - // to sink to the *latest* one, but other sites did not permit sinking, then - // we would be missing the load for other cmp-sites. - let rhs = put_input_in_reg(ctx, inputs[1]); + if ty == types::I128 { + // We need to compare both halves and combine the results appropriately. 
+ let cmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let cmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let lhs = put_input_in_regs(ctx, inputs[0]); + let lhs_lo = lhs.regs()[0]; + let lhs_hi = lhs.regs()[1]; + let rhs = put_input_in_regs(ctx, inputs[1]); + let rhs_lo = RegMemImm::reg(rhs.regs()[0]); + let rhs_hi = RegMemImm::reg(rhs.regs()[1]); + match cc { + IntCC::Equal => { + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::Z, cmp1)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::Z, cmp2)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(cmp1.to_reg()), + cmp2, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp2, + )); + IntCC::NotEqual + } + IntCC::NotEqual => { + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::NZ, cmp1)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::NZ, cmp2)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(cmp1.to_reg()), + cmp2, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp2, + )); + IntCC::NotEqual + } + IntCC::SignedLessThan + | IntCC::SignedLessThanOrEqual + | IntCC::SignedGreaterThan + | IntCC::SignedGreaterThanOrEqual + | IntCC::UnsignedLessThan + | IntCC::UnsignedLessThanOrEqual + | IntCC::UnsignedGreaterThan + | IntCC::UnsignedGreaterThanOrEqual => { + // Result = (lhs_hi <> rhs_hi) || + // (lhs_hi == rhs_hi && lhs_lo <> rhs_lo) + let cmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::cmp_rmi_r(8, rhs_hi, lhs_hi)); + ctx.emit(Inst::setcc(CC::from_intcc(cc.without_equal()), cmp1)); + ctx.emit(Inst::setcc(CC::Z, cmp2)); + ctx.emit(Inst::cmp_rmi_r(8, rhs_lo, lhs_lo)); + ctx.emit(Inst::setcc(CC::from_intcc(cc.unsigned()), cmp3)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(cmp2.to_reg()), + cmp3, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(cmp1.to_reg()), + cmp3, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(1), + cmp3, + )); + IntCC::NotEqual + } + _ => panic!("Unhandled IntCC in I128 comparison: {:?}", cc), + } + } else { + // TODO Try to commute the operands (and invert the condition) if one is an immediate. + let lhs = put_input_in_reg(ctx, inputs[0]); + // We force the RHS into a register, and disallow load-op fusion, because we + // do not have a transitive guarantee that this cmp-site will be the sole + // user of the value. Consider: the icmp might be the only user of a load, + // but there may be multiple users of the icmp (e.g. select or bint + // instructions) that each invoke `emit_cmp()`. If we were to allow a load + // to sink to the *latest* one, but other sites did not permit sinking, then + // we would be missing the load for other cmp-sites. + let rhs = put_input_in_reg(ctx, inputs[1]); - // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives - // us dst - src at the machine instruction level, so invert operands. - ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs)); + // Cranelift's icmp semantics want to compare lhs - rhs, while Intel gives + // us dst - src at the machine instruction level, so invert operands. + ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, RegMemImm::reg(rhs), lhs)); + cc + } } /// A specification for a fcmp emission. 
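// Reviewer sketch (not part of the patch): the boolean the I128 branch of
// `emit_cmp` above materializes, written in plain Rust. The low halves are
// always compared unsigned (`cc.unsigned()`), only the high-half comparison
// keeps the original signedness, and equality is the AND of the two per-half
// equalities. The lowering then returns `IntCC::NotEqual` so callers simply
// test the materialized 0/1 result.
fn i128_eq(lhs: (u64, i64), rhs: (u64, i64)) -> bool {
    lhs.1 == rhs.1 && lhs.0 == rhs.0
}

fn i128_slt(lhs: (u64, i64), rhs: (u64, i64)) -> bool {
    // (lhs_hi < rhs_hi) || (lhs_hi == rhs_hi && lhs_lo <u rhs_lo)
    lhs.1 < rhs.1 || (lhs.1 == rhs.1 && lhs.0 < rhs.0)
}

fn main() {
    // -1 (hi = -1, lo = u64::MAX) is less than 0.
    assert!(i128_slt((u64::MAX, -1), (0, 0)));
    // Same high half: the low halves decide, compared as unsigned.
    assert!(i128_slt((1, 5), (2, 5)));
    assert!(i128_eq((7, -3), (7, -3)));
    // Cross-check against native i128 arithmetic.
    let to_i128 = |(lo, hi): (u64, i64)| ((hi as i128) << 64) | lo as i128;
    assert_eq!(
        i128_slt((3, -4), (u64::MAX, -4)),
        to_i128((3, -4)) < to_i128((u64::MAX, -4))
    );
}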
@@ -489,6 +588,458 @@ fn emit_fcmp>( cond_result } +fn emit_bitrev>(ctx: &mut C, src: Reg, dst: Writable, ty: Type) { + let bits = ty.bits(); + let const_mask = if bits == 64 { + 0xffff_ffff_ffff_ffff + } else { + (1u64 << bits) - 1 + }; + let tmp0 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(tmp0, src, types::I64)); + + // Swap 1-bit units. + // tmp1 = src + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + // tmp2 = 0b0101.. + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x5555_5555_5555_5555 & const_mask, + tmp2, + )); + // tmp1 = src >> 1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + // tmp1 = (src >> 1) & 0b0101.. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + // tmp2 = src & 0b0101.. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + // tmp2 = (src & 0b0101..) << 1 + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(1), tmp2)); + // tmp0 = (src >> 1) & 0b0101.. | (src & 0b0101..) << 1 + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + // Swap 2-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x3333_3333_3333_3333 & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(2), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(2), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + // Swap 4-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0f0f_0f0f_0f0f_0f0f & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(4), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(4), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + + if bits > 8 { + // Swap 8-bit units. 
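// Reviewer sketch (not part of the patch): the mask-and-shift schedule
// `emit_bitrev` builds, run directly on a u64. Each round swaps adjacent groups
// of k bits (k = 1, 2, 4, 8, 16, 32); narrower types stop after the rounds they
// need and mask the constants with `(1 << bits) - 1`, which is what
// `const_mask` is for above.
fn bitrev64(mut x: u64) -> u64 {
    let rounds: [(u64, u32); 6] = [
        (0x5555_5555_5555_5555, 1),
        (0x3333_3333_3333_3333, 2),
        (0x0f0f_0f0f_0f0f_0f0f, 4),
        (0x00ff_00ff_00ff_00ff, 8),
        (0x0000_ffff_0000_ffff, 16),
        (0x0000_0000_ffff_ffff, 32),
    ];
    for (mask, shift) in rounds {
        // ((x >> k) & mask) picks the high member of each pair of k-bit groups,
        // ((x & mask) << k) moves the low member up; OR-ing swaps them. This is
        // the same tmp1/tmp2 dance as the emitted code.
        x = ((x >> shift) & mask) | ((x & mask) << shift);
    }
    x
}

fn main() {
    assert_eq!(bitrev64(1), 1u64 << 63);
    assert_eq!(bitrev64(0x8000_0000_0000_0000), 1);
    assert_eq!(bitrev64(0xF0F0_0000_0000_0000), 0x0000_0000_0000_0F0F);
    // Cross-check against the standard library.
    assert_eq!(bitrev64(0x0123_4567_89ab_cdef), 0x0123_4567_89ab_cdef_u64.reverse_bits());
}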
+ ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x00ff_00ff_00ff_00ff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(8), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(8), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + if bits > 16 { + // Swap 16-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0000_ffff_0000_ffff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(16), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(16), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + if bits > 32 { + // Swap 32-bit units. + ctx.emit(Inst::gen_move(tmp1, tmp0.to_reg(), types::I64)); + ctx.emit(Inst::imm( + OperandSize::Size64, + 0x0000_0000_ffff_ffff & const_mask, + tmp2, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(32), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp2.to_reg()), + tmp1, + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::reg(tmp0.to_reg()), + tmp2, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, Some(32), tmp2)); + ctx.emit(Inst::gen_move(tmp0, tmp2.to_reg(), types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp1.to_reg()), + tmp0, + )); + } + + ctx.emit(Inst::gen_move(dst, tmp0.to_reg(), types::I64)); +} + +fn emit_shl_i128>( + ctx: &mut C, + src: ValueRegs, + dst: ValueRegs>, + amt_src: Reg, +) { + let src_lo = src.regs()[0]; + let src_hi = src.regs()[1]; + let dst_lo = dst.regs()[0]; + let dst_hi = dst.regs()[1]; + + // mov tmp1, src_lo + // shl tmp1, amt_src + // mov tmp2, src_hi + // shl tmp2, amt_src + // mov amt, 64 + // sub amt, amt_src + // mov tmp3, src_lo + // shr tmp3, amt + // or tmp3, tmp2 + // xor dst_lo, dst_lo + // mov amt, amt_src + // and amt, 64 + // cmovz dst_hi, tmp3 + // cmovz dst_lo, tmp1 + // cmovnz dst_hi, tmp1 + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + ctx.emit(Inst::gen_move(tmp1, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp1)); + + ctx.emit(Inst::gen_move(tmp2, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp2)); + + ctx.emit(Inst::imm(OperandSize::Size64, 64, amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + 
RegMemImm::reg(amt_src), + amt, + )); + + ctx.emit(Inst::gen_move(tmp3, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt.to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, None, tmp3)); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp2.to_reg()), + tmp3, + )); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_lo.to_reg()), + dst_lo, + )); + // This isn't semantically necessary, but it keeps the + // register allocator happy, because it cannot otherwise + // infer that cmovz + cmovnz always defines dst_hi. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_hi.to_reg()), + dst_hi, + )); + + ctx.emit(Inst::gen_move(amt, amt_src, types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(64), + amt, + )); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp3.to_reg()), dst_hi)); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst_lo)); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst_hi)); +} + +fn emit_shr_i128>( + ctx: &mut C, + src: ValueRegs, + dst: ValueRegs>, + amt_src: Reg, + is_signed: bool, +) { + let src_lo = src.regs()[0]; + let src_hi = src.regs()[1]; + let dst_lo = dst.regs()[0]; + let dst_hi = dst.regs()[1]; + + // mov tmp1, src_hi + // {u,s}shr tmp1, amt_src + // mov tmp2, src_lo + // {u,s}shr tmp2, amt_src + // mov amt, 64 + // sub amt, amt_src + // mov tmp3, src_hi + // shl tmp3, amt + // or tmp3, tmp2 + // if is_signed: + // mov dst_hi, src_hi + // sshr dst_hi, 63 // get the sign bit + // else: + // xor dst_hi, dst_hi + // mov amt, amt_src + // and amt, 64 + // cmovz dst_hi, tmp1 + // cmovz dst_lo, tmp3 + // cmovnz dst_lo, tmp1 + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp3 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + let shift_kind = if is_signed { + ShiftKind::ShiftRightArithmetic + } else { + ShiftKind::ShiftRightLogical + }; + + ctx.emit(Inst::gen_move(tmp1, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, shift_kind, None, tmp1)); + + ctx.emit(Inst::gen_move(tmp2, src_lo, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt_src, + types::I64, + )); + ctx.emit(Inst::shift_r(8, shift_kind, None, tmp2)); + + ctx.emit(Inst::imm(OperandSize::Size64, 64, amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + amt, + )); + + ctx.emit(Inst::gen_move(tmp3, src_hi, types::I64)); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rcx()), + amt.to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftLeft, None, tmp3)); + + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp2.to_reg()), + tmp3, + )); + + if is_signed { + ctx.emit(Inst::gen_move(dst_hi, src_hi, types::I64)); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightArithmetic, + Some(63), + dst_hi, + )); + } else { + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_hi.to_reg()), + dst_hi, + )); + } + // This isn't semantically necessary, but it keeps the + // register allocator happy, because it cannot otherwise + // infer that cmovz + cmovnz always defines dst_lo. 
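// Reviewer sketch (not part of the patch): the value the shl/shr_i128 sequences
// are aiming for, composed from 64-bit halves the same way the emitted code is
// structured: compute the "< 64" result (shift plus carry bits from the other
// half) and the ">= 64" result, then select on bit 6 of the amount, which is
// what the `and amt, 64` + cmovz/cmovnz pair does. The rotl/rotr lowering later
// in this file is then just the OR of a left shift and a right shift by
// 128 - amt.
fn shl128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        (lo << amt, (hi << amt) | (lo >> (64 - amt)))
    } else {
        (0, lo << (amt - 64))
    }
}

fn ushr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        ((lo >> amt) | (hi << (64 - amt)), hi >> amt)
    } else {
        (hi >> (amt - 64), 0)
    }
}

fn main() {
    // Cross-check against native u128 shifts for a few amounts.
    let x: u128 = 0x0123_4567_89ab_cdef_0011_2233_4455_6677;
    let (lo, hi) = (x as u64, (x >> 64) as u64);
    for amt in [1u32, 7, 63, 64, 65, 100, 127] {
        let (slo, shi) = shl128(lo, hi, amt);
        assert_eq!(((shi as u128) << 64) | slo as u128, x << amt);
        let (rlo, rhi) = ushr128(lo, hi, amt);
        assert_eq!(((rhi as u128) << 64) | rlo as u128, x >> amt);
    }
}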
+ ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst_lo.to_reg()), + dst_lo, + )); + + ctx.emit(Inst::gen_move(amt, amt_src, types::I64)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::And, + RegMemImm::imm(64), + amt, + )); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst_hi)); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp3.to_reg()), dst_lo)); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst_lo)); +} + fn make_libcall_sig>( ctx: &mut C, insn: IRInst, @@ -676,6 +1227,101 @@ fn lower_to_amode>(ctx: &mut C, spec: InsnInput, offset: i Amode::imm_reg(offset as u32, input).with_flags(flags) } +fn emit_moves>( + ctx: &mut C, + dst: ValueRegs>, + src: ValueRegs, + ty: Type, +) { + let (_, tys) = Inst::rc_for_type(ty).unwrap(); + for ((dst, src), ty) in dst.regs().iter().zip(src.regs().iter()).zip(tys.iter()) { + ctx.emit(Inst::gen_move(*dst, *src, *ty)); + } +} + +fn emit_cmoves>( + ctx: &mut C, + size: u8, + cc: CC, + src: ValueRegs, + dst: ValueRegs>, +) { + let size = size / src.len() as u8; + let size = u8::max(size, 4); // at least 32 bits + for (dst, src) in dst.regs().iter().zip(src.regs().iter()) { + ctx.emit(Inst::cmove(size, cc, RegMem::reg(*src), *dst)); + } +} + +fn emit_clz>( + ctx: &mut C, + orig_ty: Type, + ty: Type, + src: Reg, + dst: Writable, +) { + let src = RegMem::reg(src); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + u64::max_value(), + dst, + )); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsr, + src, + tmp, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(dst.to_reg()), + tmp, + )); + + ctx.emit(Inst::imm( + OperandSize::from_bytes(ty.bytes()), + orig_ty.bits() as u64 - 1, + dst, + )); + + ctx.emit(Inst::alu_rmi_r( + ty == types::I64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp.to_reg()), + dst, + )); +} + +fn emit_ctz>( + ctx: &mut C, + orig_ty: Type, + ty: Type, + src: Reg, + dst: Writable, +) { + let src = RegMem::reg(src); + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size32, orig_ty.bits() as u64, tmp)); + + ctx.emit(Inst::unary_rm_r( + ty.bytes() as u8, + UnaryRmROpcode::Bsf, + src, + dst, + )); + + ctx.emit(Inst::cmove( + ty.bytes() as u8, + CC::Z, + RegMem::reg(tmp.to_reg()), + dst, + )); +} + //============================================================================= // Top-level instruction lowering entry point, for one instruction. @@ -898,6 +1544,102 @@ fn lower_insn_to_regs>( // Move the `lhs` to the same register as `dst`. 
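// Reviewer sketch (not part of the patch): what `emit_clz`/`emit_ctz` above
// compute. BSR/BSF return the index of the highest/lowest set bit and leave ZF
// set for a zero input, so the cmov supplies the "all bits zero" answer; the
// 128-bit versions later in this file combine two 64-bit results ("use the high
// half's count unless that half was all zero, in which case add 64 to the low
// half's count", and symmetrically for ctz).
fn clz_via_bsr(bits: u32, src: u64) -> u64 {
    // bsr(src) = index of the most significant set bit; -1 stands in for the
    // "input was zero" cmov path so that (bits - 1) - (-1) == bits.
    let bsr: i64 = if src == 0 { -1 } else { 63 - src.leading_zeros() as i64 };
    (bits as i64 - 1 - bsr) as u64
}

fn ctz_via_bsf(bits: u32, src: u64) -> u64 {
    if src == 0 { bits as u64 } else { src.trailing_zeros() as u64 }
}

fn clz128(lo: u64, hi: u64) -> u64 {
    let hi_clz = clz_via_bsr(64, hi);
    if hi_clz != 64 { hi_clz } else { 64 + clz_via_bsr(64, lo) }
}

fn main() {
    assert_eq!(clz_via_bsr(64, 1), 63);
    assert_eq!(clz_via_bsr(32, 0), 32);
    assert_eq!(ctz_via_bsf(64, 0b1000), 3);
    assert_eq!(clz128(0xffff, 0), 64 + 48);
    assert_eq!(clz128(0, 1u64 << 63), 0);
}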
ctx.emit(Inst::gen_move(dst, lhs, ty)); ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else if ty == types::I128 || ty == types::B128 { + let alu_ops = match op { + Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc), + Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb), + // multiply handled specially below + Opcode::Imul => (AluRmiROpcode::Mul, AluRmiROpcode::Mul), + Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And), + Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or), + Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor), + _ => panic!("Unsupported opcode with 128-bit integers: {:?}", op), + }; + let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + assert_eq!(lhs.len(), 2); + assert_eq!(rhs.len(), 2); + assert_eq!(dst.len(), 2); + + if op != Opcode::Imul { + // add, sub, and, or, xor: just do ops on lower then upper half. Carry-flag + // propagation is implicit (add/adc, sub/sbb). + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + alu_ops.0, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + alu_ops.1, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); + } else { + // mul: + // dst_lo = lhs_lo * rhs_lo + // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo + // + // so we emit: + // mov dst_lo, lhs_lo + // mul dst_lo, rhs_lo + // mov dst_hi, lhs_lo + // mul dst_hi, rhs_hi + // mov tmp, lhs_hi + // mul tmp, rhs_lo + // add dst_hi, tmp + // mov rax, lhs_lo + // umulhi rhs_lo // implicit rax arg/dst + // add dst_hi, rax + let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + tmp, + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Add, + RegMemImm::reg(tmp.to_reg()), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + lhs.regs()[0], + types::I64, + )); + ctx.emit(Inst::mul_hi( + /* size = */ 8, + /* signed = */ false, + RegMem::reg(rhs.regs()[0]), + )); + ctx.emit(Inst::alu_rmi_r( + /* is_64 = */ true, + AluRmiROpcode::Add, + RegMemImm::reg(regs::rdx()), + dst.regs()[1], + )); + } } else { let is_64 = ty == types::I64; let alu_op = match op { @@ -1022,17 +1764,27 @@ fn lower_insn_to_regs>( Opcode::Bnot => { let ty = ty.unwrap(); let size = ty.bytes() as u8; - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gen_move(dst, src, ty)); if ty.is_vector() { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); ctx.emit(Inst::equals(ty, RegMem::from(tmp), tmp)); ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst)); + } else if ty == 
types::I128 || ty == types::B128 { + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst.regs()[0], src.regs()[0], types::I64)); + ctx.emit(Inst::not(8, dst.regs()[0])); + ctx.emit(Inst::gen_move(dst.regs()[1], src.regs()[1], types::I64)); + ctx.emit(Inst::not(8, dst.regs()[1])); } else if ty.is_bool() { unimplemented!("bool bnot") } else { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, ty)); ctx.emit(Inst::not(size, dst)); } } @@ -1064,7 +1816,7 @@ fn lower_insn_to_regs>( let dst_ty = ctx.output_ty(insn, 0); debug_assert_eq!(ctx.input_ty(insn, 0), dst_ty); - if !dst_ty.is_vector() { + if !dst_ty.is_vector() && dst_ty.bits() <= 64 { // Scalar shifts on x86 have various encodings: // - shift by one bit, e.g. `SAL r/m8, 1` (not used here) // - shift by an immediate amount, e.g. `SAL r/m8, imm8` @@ -1118,6 +1870,89 @@ fn lower_insn_to_regs>( ctx.emit(Inst::mov_r_r(true, rhs.unwrap(), w_rcx)); } ctx.emit(Inst::shift_r(size, shift_kind, count, dst)); + } else if dst_ty == types::I128 { + let amt_src = put_input_in_reg(ctx, inputs[1]); + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + + match op { + Opcode::Ishl => { + emit_shl_i128(ctx, src, dst, amt_src); + } + Opcode::Ushr => { + emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ false); + } + Opcode::Sshr => { + emit_shr_i128(ctx, src, dst, amt_src, /* is_signed = */ true); + } + Opcode::Rotl => { + // (mov tmp, src) + // (shl.i128 tmp, amt) + // (mov dst, src) + // (ushr.i128 dst, 128-amt) + // (or dst, tmp) + let tmp = ctx.alloc_tmp(types::I128); + emit_shl_i128(ctx, src, tmp, amt_src); + let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + inv_amt, + )); + emit_shr_i128( + ctx, + src, + dst, + inv_amt.to_reg(), + /* is_signed = */ false, + ); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[0].to_reg()), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[1].to_reg()), + dst.regs()[1], + )); + } + Opcode::Rotr => { + // (mov tmp, src) + // (ushr.i128 tmp, amt) + // (mov dst, src) + // (shl.i128 dst, 128-amt) + // (or dst, tmp) + let tmp = ctx.alloc_tmp(types::I128); + emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false); + let inv_amt = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::imm(OperandSize::Size64, 128, inv_amt)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Sub, + RegMemImm::reg(amt_src), + inv_amt, + )); + emit_shl_i128(ctx, src, dst, inv_amt.to_reg()); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[0].to_reg()), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Or, + RegMemImm::reg(tmp.regs()[1].to_reg()), + dst.regs()[1], + )); + } + _ => unreachable!(), + } } else if dst_ty == types::I8X16 && (op == Opcode::Ishl || op == Opcode::Ushr) { // Since the x86 instruction set does not have any 8x16 shift instructions (even in higher feature sets // like AVX), we lower the `ishl.i8x16` and `ushr.i8x16` to a sequence of instructions. 
The basic idea, @@ -1449,52 +2284,50 @@ fn lower_insn_to_regs>( // mov $(size_bits - 1), %dst // sub %tmp, %dst - let (ext_spec, ty) = match ctx.input_ty(insn, 0) { - types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), - a if a == types::I32 || a == types::I64 => (None, a), - _ => unreachable!(), - }; - - let src = if let Some(ext_spec) = ext_spec { - RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + let orig_ty = ty.unwrap(); + if orig_ty == types::I128 { + // clz upper, tmp1 + // clz lower, dst + // add dst, 64 + // cmp tmp1, 64 + // cmovnz tmp1, dst + let dsts = get_output_reg(ctx, outputs[0]); + let dst = dsts.regs()[0]; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let srcs = put_input_in_regs(ctx, inputs[0]); + let src_lo = srcs.regs()[0]; + let src_hi = srcs.regs()[1]; + emit_clz(ctx, types::I64, types::I64, src_hi, tmp1); + emit_clz(ctx, types::I64, types::I64, src_lo, dst); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(64), + dst, + )); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(64), tmp1.to_reg())); + ctx.emit(Inst::cmove(8, CC::NZ, RegMem::reg(tmp1.to_reg()), dst)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dsts.regs()[1].to_reg()), + dsts.regs()[1], + )); } else { - input_to_reg_mem(ctx, inputs[0]) - }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let (ext_spec, ty) = match orig_ty { + types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), + a if a == types::I32 || a == types::I64 => (None, a), + _ => unreachable!(), + }; + let src = if let Some(ext_spec) = ext_spec { + extend_input_to_reg(ctx, inputs[0], ext_spec) + } else { + put_input_in_reg(ctx, inputs[0]) + }; - let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); - ctx.emit(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - u64::max_value(), - dst, - )); - - ctx.emit(Inst::unary_rm_r( - ty.bytes() as u8, - UnaryRmROpcode::Bsr, - src, - tmp, - )); - - ctx.emit(Inst::cmove( - ty.bytes() as u8, - CC::Z, - RegMem::reg(dst.to_reg()), - tmp, - )); - - ctx.emit(Inst::imm( - OperandSize::from_bytes(ty.bytes()), - ty.bits() as u64 - 1, - dst, - )); - - ctx.emit(Inst::alu_rmi_r( - ty == types::I64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp.to_reg()), - dst, - )); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_clz(ctx, orig_ty, ty, src, dst); + } } Opcode::Ctz => { @@ -1504,29 +2337,47 @@ fn lower_insn_to_regs>( // bsf %src, %dst // mov $(size_bits), %tmp // cmovz %tmp, %dst - let ty = ctx.input_ty(insn, 0); - let ty = if ty.bits() < 32 { types::I32 } else { ty }; - debug_assert!(ty == types::I32 || ty == types::I64); + let orig_ty = ctx.input_ty(insn, 0); + if orig_ty == types::I128 { + // ctz src_lo, dst + // ctz src_hi, tmp1 + // add tmp1, 64 + // cmp dst, 64 + // cmovz tmp1, dst + let dsts = get_output_reg(ctx, outputs[0]); + let dst = dsts.regs()[0]; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let srcs = put_input_in_regs(ctx, inputs[0]); + let src_lo = srcs.regs()[0]; + let src_hi = srcs.regs()[1]; + emit_ctz(ctx, types::I64, types::I64, src_lo, dst); + emit_ctz(ctx, types::I64, types::I64, src_hi, tmp1); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Add, + RegMemImm::imm(64), + tmp1, + )); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(64), dst.to_reg())); + ctx.emit(Inst::cmove(8, CC::Z, RegMem::reg(tmp1.to_reg()), dst)); + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dsts.regs()[1].to_reg()), + 
dsts.regs()[1], + )); + } else { + let ty = if orig_ty.bits() < 32 { + types::I32 + } else { + orig_ty + }; + debug_assert!(ty == types::I32 || ty == types::I64); - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); - ctx.emit(Inst::imm(OperandSize::Size32, ty.bits() as u64, tmp)); - - ctx.emit(Inst::unary_rm_r( - ty.bytes() as u8, - UnaryRmROpcode::Bsf, - src, - dst, - )); - - ctx.emit(Inst::cmove( - ty.bytes() as u8, - CC::Z, - RegMem::reg(tmp.to_reg()), - dst, - )); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_ctz(ctx, orig_ty, ty, src, dst); + } } Opcode::Popcnt => { @@ -1535,272 +2386,329 @@ fn lower_insn_to_regs>( let (ext_spec, ty) = match ctx.input_ty(insn, 0) { types::I8 | types::I16 => (Some(ExtSpec::ZeroExtendTo32), types::I32), a if a == types::I32 || a == types::I64 => (None, a), + types::I128 => (None, types::I128), _ => unreachable!(), }; - let src = if let Some(ext_spec) = ext_spec { - RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)) + let (srcs, ty): (SmallVec<[RegMem; 2]>, Type) = if let Some(ext_spec) = ext_spec { + ( + smallvec![RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec))], + ty, + ) + } else if ty == types::I128 { + let regs = put_input_in_regs(ctx, inputs[0]); + ( + smallvec![RegMem::reg(regs.regs()[0]), RegMem::reg(regs.regs()[1])], + types::I64, + ) } else { // N.B.: explicitly put input in a reg here because the width of the instruction // into which this RM op goes may not match the width of the input type (in fact, // it won't for i32.popcnt), and we don't want a larger than necessary load. - RegMem::reg(put_input_in_reg(ctx, inputs[0])) + (smallvec![RegMem::reg(put_input_in_reg(ctx, inputs[0]))], ty) }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - if ty == types::I64 { - let is_64 = true; + let mut dsts: SmallVec<[Reg; 2]> = smallvec![]; + for src in srcs { + let dst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + dsts.push(dst.to_reg()); + if ty == types::I64 { + let is_64 = true; - let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let cst = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - // mov src, tmp1 - ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + // mov src, tmp1 + ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // mov 0x7777_7777_7777_7777, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); + // mov 0x7777_7777_7777_7777, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x7777777777777777, cst)); - // andq cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // andq cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // mov src, tmp2 - ctx.emit(Inst::mov64_rm_r(src, tmp2)); + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - 
RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // and cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); - // and cst, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - tmp1, - )); + // and cst, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + tmp1, + )); - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); - // mov tmp2, dst - ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); - // shr $4, dst - ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); + // shr $4, dst + ctx.emit(Inst::shift_r(8, ShiftKind::ShiftRightLogical, Some(4), dst)); - // add tmp2, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Add, - RegMemImm::reg(tmp2.to_reg()), - dst, - )); + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); - // mov $0x0F0F_0F0F_0F0F_0F0F, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); + // mov $0x0F0F_0F0F_0F0F_0F0F, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0F0F0F0F0F0F0F0F, cst)); - // and cst, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::reg(cst.to_reg()), - dst, - )); + // and cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::reg(cst.to_reg()), + dst, + )); - // mov $0x0101_0101_0101_0101, cst - ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); + // mov $0x0101_0101_0101_0101, cst + ctx.emit(Inst::imm(OperandSize::Size64, 0x0101010101010101, cst)); - // mul cst, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Mul, - RegMemImm::reg(cst.to_reg()), - dst, - )); + // mul cst, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::reg(cst.to_reg()), + dst, + )); - // shr $56, dst - ctx.emit(Inst::shift_r( - 8, - ShiftKind::ShiftRightLogical, - Some(56), - dst, - )); + // shr $56, dst + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(56), + dst, + )); + } else { + assert_eq!(ty, types::I32); + let is_64 = false; + + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + + // mov src, tmp1 + 
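// Reviewer sketch (not part of the patch): the branch-free popcount the
// sequence above implements (64-bit variant; the 32-bit path is the same with
// narrower constants and a final shift of 24, and the i128 case runs it once
// per 64-bit half before combining the two counts).
fn popcnt64(x: u64) -> u64 {
    const M: u64 = 0x7777_7777_7777_7777;
    // Per-nibble popcount: n - (n>>1) - (n>>2) - (n>>3), done with the repeated
    // "shift right by one and mask" steps, exactly like the tmp1/tmp2 dance.
    let mut t1 = (x >> 1) & M;
    let mut t2 = x.wrapping_sub(t1);
    t1 = (t1 >> 1) & M;
    t2 = t2.wrapping_sub(t1);
    t1 = (t1 >> 1) & M;
    t2 = t2.wrapping_sub(t1);
    // Fold nibble counts into byte counts, then sum all bytes via the
    // 0x0101...-multiply trick; the total lands in the top byte.
    let bytes = t2.wrapping_add(t2 >> 4) & 0x0f0f_0f0f_0f0f_0f0f;
    bytes.wrapping_mul(0x0101_0101_0101_0101) >> 56
}

fn main() {
    for x in [0u64, 1, u64::MAX, 0xdead_beef_cafe_f00d] {
        assert_eq!(popcnt64(x), x.count_ones() as u64);
    }
}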
ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // andq $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // mov src, tmp2 + ctx.emit(Inst::mov64_rm_r(src, tmp2)); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and 0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // shr $1, tmp1 + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(1), + tmp1, + )); + + // and $0x7777_7777, tmp1 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x77777777), + tmp1, + )); + + // sub tmp1, tmp2 + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Sub, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + + // mov tmp2, dst + ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); + + // shr $4, dst + ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); + + // add tmp2, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp2.to_reg()), + dst, + )); + + // and $0x0F0F_0F0F, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::And, + RegMemImm::imm(0x0F0F0F0F), + dst, + )); + + // mul $0x0101_0101, dst + ctx.emit(Inst::alu_rmi_r( + is_64, + AluRmiROpcode::Mul, + RegMemImm::imm(0x01010101), + dst, + )); + + // shr $24, dst + ctx.emit(Inst::shift_r( + 4, + ShiftKind::ShiftRightLogical, + Some(24), + dst, + )); + } + } + + if dsts.len() == 1 { + let final_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(final_dst, dsts[0], types::I64)); } else { - assert_eq!(ty, types::I32); - let is_64 = false; - - let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - - // mov src, tmp1 - ctx.emit(Inst::mov64_rm_r(src.clone(), tmp1)); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // andq $0x7777_7777, tmp1 + assert!(dsts.len() == 2); + let final_dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(final_dst.regs()[0], dsts[0], types::I64)); ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // mov src, tmp2 - ctx.emit(Inst::mov64_rm_r(src, tmp2)); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // and 0x7777_7777, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // shr $1, tmp1 - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(1), - tmp1, - )); - - // and $0x7777_7777, tmp1 - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x77777777), - tmp1, - )); - - // sub tmp1, tmp2 - ctx.emit(Inst::alu_rmi_r( - is_64, - 
AluRmiROpcode::Sub, - RegMemImm::reg(tmp1.to_reg()), - tmp2, - )); - - // mov tmp2, dst - ctx.emit(Inst::mov64_rm_r(RegMem::reg(tmp2.to_reg()), dst)); - - // shr $4, dst - ctx.emit(Inst::shift_r(4, ShiftKind::ShiftRightLogical, Some(4), dst)); - - // add tmp2, dst - ctx.emit(Inst::alu_rmi_r( - is_64, + true, AluRmiROpcode::Add, - RegMemImm::reg(tmp2.to_reg()), - dst, + RegMemImm::reg(dsts[1]), + final_dst.regs()[0], )); - - // and $0x0F0F_0F0F, dst ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::And, - RegMemImm::imm(0x0F0F0F0F), - dst, + true, + AluRmiROpcode::Xor, + RegMemImm::reg(final_dst.regs()[1].to_reg()), + final_dst.regs()[1], )); + } + } - // mul $0x0101_0101, dst - ctx.emit(Inst::alu_rmi_r( - is_64, - AluRmiROpcode::Mul, - RegMemImm::imm(0x01010101), - dst, - )); + Opcode::Bitrev => { + let ty = ctx.input_ty(insn, 0); + assert!( + ty == types::I8 + || ty == types::I16 + || ty == types::I32 + || ty == types::I64 + || ty == types::I128 + ); - // shr $24, dst - ctx.emit(Inst::shift_r( - 4, - ShiftKind::ShiftRightLogical, - Some(24), - dst, - )); + if ty == types::I128 { + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + emit_bitrev(ctx, src.regs()[0], dst.regs()[1], types::I64); + emit_bitrev(ctx, src.regs()[1], dst.regs()[0], types::I64); + } else { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + emit_bitrev(ctx, src, dst, ty); } } @@ -1836,63 +2744,112 @@ fn lower_insn_to_regs>( let src_ty = ctx.input_ty(insn, 0); let dst_ty = ctx.output_ty(insn, 0); - // Sextend requires a sign-extended move, but all the other opcodes are simply a move - // from a zero-extended source. Here is why this works, in each case: - // - // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to - // zero-extend here. - // - // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so - // again, this is a zero-extend / no-op. - // - // - Ireduce: changing width of an integer. Smaller ints are stored with undefined - // high-order bits, so we can simply do a copy. + if src_ty == types::I128 { + assert!(dst_ty.bits() <= 64); + assert!(op == Opcode::Ireduce); + let src = put_input_in_regs(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src.regs()[0], types::I64)); + } else if dst_ty == types::I128 { + assert!(src_ty.bits() <= 64); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]); + assert!(op == Opcode::Uextend || op == Opcode::Sextend || op == Opcode::Bint); + // Extend to 64 bits first. - if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { - // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on - // 32-bits will zero-extend the upper 32-bits, so we can even not generate a - // zero-extended move in this case. - // TODO add loads and shifts here. 
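The Popcnt lowering above is the classic SWAR bit trick: subtract three shifted-and-masked copies of the value to get per-nibble counts, fold nibbles into bytes, then multiply by 0x0101_0101_0101_0101 so the byte sum collects in the top byte; for i128 the same 64-bit sequence runs on each half and the two counts are added, with the upper result register zeroed. A minimal standalone Rust sketch of that arithmetic, for reference only (names here are illustrative and not part of the patch):

fn popcnt64_swar(x: u64) -> u64 {
    // Per-nibble popcount: x - (x>>1 & 0x7..7) - (x>>2 & 0x3..3) - (x>>3 & 0x1..1).
    // Re-shifting and re-masking the same temporary with 0x7777... produces
    // exactly those three terms, matching the emitted instruction sequence.
    let mut t = (x >> 1) & 0x7777_7777_7777_7777;
    let mut n = x.wrapping_sub(t);
    t = (t >> 1) & 0x7777_7777_7777_7777;
    n = n.wrapping_sub(t);
    t = (t >> 1) & 0x7777_7777_7777_7777;
    n = n.wrapping_sub(t);
    // Fold nibble counts into byte counts, then sum all bytes into the top byte.
    let b = n.wrapping_add(n >> 4) & 0x0F0F_0F0F_0F0F_0F0F;
    b.wrapping_mul(0x0101_0101_0101_0101) >> 56
}

fn popcnt128_swar(lo: u64, hi: u64) -> (u64, u64) {
    // i128: popcount each 64-bit half, add, and zero the upper half of the result.
    (popcnt64_swar(lo) + popcnt64_swar(hi), 0)
}

fn main() {
    assert_eq!(popcnt64_swar(0), 0);
    assert_eq!(popcnt64_swar(u64::MAX), 64);
    assert_eq!(popcnt64_swar(0x8000_0000_0000_0001), 2);
    assert_eq!(popcnt128_swar(u64::MAX, 1), (65, 0));
}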
- if let Some(_) = matches_input_any( - ctx, - inputs[0], - &[ - Opcode::Iadd, - Opcode::IaddIfcout, - Opcode::Isub, - Opcode::Imul, - Opcode::Band, - Opcode::Bor, - Opcode::Bxor, - ], - ) { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - ctx.emit(Inst::gen_move(dst, src, types::I64)); - return Ok(()); - } - } - - let src = input_to_reg_mem(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); - assert_eq!( - src_ty.bits() < dst_ty.bits(), - ext_mode.is_some(), - "unexpected extension: {} -> {}", - src_ty, - dst_ty - ); - - if let Some(ext_mode) = ext_mode { - if op == Opcode::Sextend { - ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + let ext_mode = ExtMode::new(src_ty.bits(), /* dst bits = */ 64); + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0])); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, RegMem::reg(src), dst.regs()[0])); + } } else { - ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + ctx.emit(Inst::mov64_rm_r(RegMem::reg(src), dst.regs()[0])); + } + + // Now generate the top 64 bits. + if op == Opcode::Sextend { + // Sign-extend: move dst[0] into dst[1] and arithmetic-shift right by 63 bits + // to spread the sign bit across all bits. + ctx.emit(Inst::gen_move( + dst.regs()[1], + dst.regs()[0].to_reg(), + types::I64, + )); + ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightArithmetic, + Some(63), + dst.regs()[1], + )); + } else { + // Zero-extend: just zero the top word. + ctx.emit(Inst::alu_rmi_r( + true, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.regs()[1].to_reg()), + dst.regs()[1], + )); } } else { - ctx.emit(Inst::mov64_rm_r(src, dst)); + // Sextend requires a sign-extended move, but all the other opcodes are simply a move + // from a zero-extended source. Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we merely need to + // zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a bool as a 0 or 1, so + // again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored with undefined + // high-order bits, so we can simply do a copy. + if src_ty == types::I32 && dst_ty == types::I64 && op != Opcode::Sextend { + // As a particular x64 extra-pattern matching opportunity, all the ALU opcodes on + // 32-bits will zero-extend the upper 32-bits, so we can even not generate a + // zero-extended move in this case. + // TODO add loads and shifts here. 
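The i128 Uextend/Sextend/Bint path above first widens the source to 64 bits and then synthesizes the upper half: Sextend copies the low half and arithmetic-shifts it right by 63 so that every bit of the upper word is a copy of the sign bit, while Uextend and Bint simply zero the upper register with a self-xor. A small Rust model of that computation (illustrative only, not the emitted code itself):

fn sextend_to_i128(lo: i64) -> (u64, u64) {
    // Arithmetic shift by 63 spreads the sign bit across the whole upper word.
    let hi = (lo >> 63) as u64;
    (lo as u64, hi)
}

fn uextend_to_i128(lo: u64) -> (u64, u64) {
    // Zero-extend: the upper word is cleared (xor dst_hi, dst_hi in the patch).
    (lo, 0)
}

fn main() {
    assert_eq!(sextend_to_i128(-1), (u64::MAX, u64::MAX));
    assert_eq!(sextend_to_i128(5), (5, 0));
    assert_eq!(uextend_to_i128(u64::MAX), (u64::MAX, 0));
}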
+ if let Some(_) = matches_input_any( + ctx, + inputs[0], + &[ + Opcode::Iadd, + Opcode::IaddIfcout, + Opcode::Isub, + Opcode::Imul, + Opcode::Band, + Opcode::Bor, + Opcode::Bxor, + ], + ) { + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst, src, types::I64)); + return Ok(()); + } + } + + let src = input_to_reg_mem(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + let ext_mode = ExtMode::new(src_ty.bits(), dst_ty.bits()); + assert_eq!( + src_ty.bits() < dst_ty.bits(), + ext_mode.is_some(), + "unexpected extension: {} -> {}", + src_ty, + dst_ty + ); + + if let Some(ext_mode) = ext_mode { + if op == Opcode::Sextend { + ctx.emit(Inst::movsx_rm_r(ext_mode, src, dst)); + } else { + ctx.emit(Inst::movzx_rm_r(ext_mode, src, dst)); + } + } else { + ctx.emit(Inst::mov64_rm_r(src, dst)); + } } } @@ -1901,7 +2858,7 @@ fn lower_insn_to_regs>( let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ctx.input_ty(insn, 0); if !ty.is_vector() { - emit_cmp(ctx, insn); + let condcode = emit_cmp(ctx, insn, condcode); let cc = CC::from_intcc(condcode); ctx.emit(Inst::setcc(cc, dst)); } else { @@ -2108,10 +3065,19 @@ fn lower_insn_to_regs>( Opcode::FallthroughReturn | Opcode::Return => { for i in 0..ctx.num_inputs(insn) { - let src_reg = put_input_in_reg(ctx, inputs[i]); + let src_reg = put_input_in_regs(ctx, inputs[i]); let retval_reg = ctx.retval(i); let ty = ctx.input_ty(insn, i); - ctx.emit(Inst::gen_move(retval_reg.only_reg().unwrap(), src_reg, ty)); + assert!(src_reg.len() == retval_reg.len()); + let (_, tys) = Inst::rc_for_type(ty)?; + for ((&src, &dst), &ty) in src_reg + .regs() + .iter() + .zip(retval_reg.regs().iter()) + .zip(tys.iter()) + { + ctx.emit(Inst::gen_move(dst, src, ty)); + } } // N.B.: the Ret itself is generated by the ABI. } @@ -2147,13 +3113,13 @@ fn lower_insn_to_regs>( abi.emit_stack_pre_adjust(ctx); assert_eq!(inputs.len(), abi.num_args()); for (i, input) in inputs.iter().enumerate() { - let arg_reg = put_input_in_reg(ctx, *input); - abi.emit_copy_regs_to_arg(ctx, i, ValueRegs::one(arg_reg)); + let arg_regs = put_input_in_regs(ctx, *input); + abi.emit_copy_regs_to_arg(ctx, i, arg_regs); } abi.emit_call(ctx); for (i, output) in outputs.iter().enumerate() { - let retval_reg = get_output_reg(ctx, *output).only_reg().unwrap(); - abi.emit_copy_retval_to_regs(ctx, i, ValueRegs::one(retval_reg)); + let retval_regs = get_output_reg(ctx, *output); + abi.emit_copy_retval_to_regs(ctx, i, retval_regs); } abi.emit_stack_post_adjust(ctx); } @@ -2180,11 +3146,11 @@ fn lower_insn_to_regs>( ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); } else if op == Opcode::Trapif { let cond_code = ctx.data(insn).cond_code().unwrap(); - let cc = CC::from_intcc(cond_code); // Verification ensures that the input is always a single-def ifcmp. let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap(); - emit_cmp(ctx, ifcmp); + let cond_code = emit_cmp(ctx, ifcmp, cond_code); + let cc = CC::from_intcc(cond_code); ctx.emit_safepoint(Inst::TrapIf { trap_code, cc }); } else { @@ -2266,7 +3232,9 @@ fn lower_insn_to_regs>( Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { let lhs = put_input_in_reg(ctx, inputs[0]); - let rhs = input_to_reg_mem(ctx, inputs[1]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. 
+ let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2523,7 +3491,9 @@ fn lower_insn_to_regs>( } Opcode::FminPseudo | Opcode::FmaxPseudo => { - let lhs = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let lhs = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let rhs = put_input_in_reg(ctx, inputs[1]); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2539,7 +3509,9 @@ fn lower_insn_to_regs>( } Opcode::Sqrt => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); @@ -2558,13 +3530,17 @@ fn lower_insn_to_regs>( } Opcode::Fpromote => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst)); } Opcode::Fdemote => { - let src = input_to_reg_mem(ctx, inputs[0]); + // We can't guarantee the RHS (if a load) is 128-bit aligned, so we + // must avoid merging a load here. + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst)); } @@ -2581,7 +3557,7 @@ fn lower_insn_to_regs>( let src = match ext_spec { Some(ext_spec) => RegMem::reg(extend_input_to_reg(ctx, inputs[0], ext_spec)), - None => input_to_reg_mem(ctx, inputs[0]), + None => RegMem::reg(put_input_in_reg(ctx, inputs[0])), }; let opcode = if output_ty == types::F32 { @@ -3096,7 +4072,7 @@ fn lower_insn_to_regs>( } Opcode::Fabs | Opcode::Fneg => { - let src = input_to_reg_mem(ctx, inputs[0]); + let src = RegMem::reg(put_input_in_reg(ctx, inputs[0])); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // In both cases, generate a constant and apply a single binary instruction: @@ -3392,59 +4368,64 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); - - match (sign_extend, is_xmm) { - (true, false) => { - // The load is sign-extended only when the output size is lower than 64 bits, - // so ext-mode is defined in this case. - ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); - } - (false, false) => { - if elem_ty.bytes() == 8 { - // Use a plain load. - ctx.emit(Inst::mov64_m_r(amode, dst)) - } else { - // Use a zero-extended load. - ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + if elem_ty == types::I128 { + let dsts = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::mov64_m_r(amode.clone(), dsts.regs()[0])); + ctx.emit(Inst::mov64_m_r(amode.offset(8), dsts.regs()[1])); + } else { + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let is_xmm = elem_ty.is_float() || elem_ty.is_vector(); + match (sign_extend, is_xmm) { + (true, false) => { + // The load is sign-extended only when the output size is lower than 64 bits, + // so ext-mode is defined in this case. 
+ ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)); } - } - (_, true) => { - ctx.emit(match elem_ty { - types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), - types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), - types::I8X8 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst) + (false, false) => { + if elem_ty.bytes() == 8 { + // Use a plain load. + ctx.emit(Inst::mov64_m_r(amode, dst)) + } else { + // Use a zero-extended load. + ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst)) + } + } + (_, true) => { + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst), + types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst), + types::I8X8 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst) + } } - } - types::I16X4 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst) + types::I16X4 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst) + } } - } - types::I32X2 => { - if sign_extend == true { - Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst) - } else { - Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst) + types::I32X2 => { + if sign_extend == true { + Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst) + } else { + Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst) + } } - } - _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { - Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) - } - // TODO Specialize for different types: MOVUPD, MOVDQU - _ => unreachable!( - "unexpected type for load: {:?} - {:?}", - elem_ty, - elem_ty.bits() - ), - }); + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst) + } + // TODO Specialize for different types: MOVUPD, MOVDQU + _ => unreachable!( + "unexpected type for load: {:?} - {:?}", + elem_ty, + elem_ty.bits() + ), + }); + } } } } @@ -3491,17 +4472,23 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; - let src = put_input_in_reg(ctx, inputs[0]); + if elem_ty == types::I128 { + let srcs = put_input_in_regs(ctx, inputs[0]); + ctx.emit(Inst::mov_r_m(8, srcs.regs()[0], addr.clone())); + ctx.emit(Inst::mov_r_m(8, srcs.regs()[1], addr.offset(8))); + } else { + let src = put_input_in_reg(ctx, inputs[0]); - ctx.emit(match elem_ty { - types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), - types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), - _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { - // TODO Specialize for different types: MOVUPD, MOVDQU, etc. - Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) - } - _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), - }); + ctx.emit(match elem_ty { + types::F32 => Inst::xmm_mov_r_m(SseOpcode::Movss, src, addr), + types::F64 => Inst::xmm_mov_r_m(SseOpcode::Movsd, src, addr), + _ if elem_ty.is_vector() && elem_ty.bits() == 128 => { + // TODO Specialize for different types: MOVUPD, MOVDQU, etc. 
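For i128, the load and store lowerings above split the access into two 64-bit moves, with the low half at offset 0 and the high half at offset 8 (little-endian word order). A short Rust sketch of the equivalent in-memory layout (the helper names are illustrative, not from the patch):

use std::convert::TryInto;

fn store_i128(buf: &mut [u8; 16], value: u128) {
    // mov lo, 0(addr) / mov hi, 8(addr)
    buf[0..8].copy_from_slice(&(value as u64).to_le_bytes());
    buf[8..16].copy_from_slice(&((value >> 64) as u64).to_le_bytes());
}

fn load_i128(buf: &[u8; 16]) -> u128 {
    // mov 0(addr), lo / mov 8(addr), hi
    let lo = u64::from_le_bytes(buf[0..8].try_into().unwrap());
    let hi = u64::from_le_bytes(buf[8..16].try_into().unwrap());
    (lo as u128) | ((hi as u128) << 64)
}

fn main() {
    let mut buf = [0u8; 16];
    store_i128(&mut buf, 0x1122_3344_5566_7788_99aa_bbcc_ddee_ff00);
    assert_eq!(load_i128(&buf), 0x1122_3344_5566_7788_99aa_bbcc_ddee_ff00);
}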
+ Inst::xmm_mov_r_m(SseOpcode::Movups, src, addr) + } + _ => Inst::mov_r_m(elem_ty.bytes() as u8, src, addr), + }); + } } Opcode::AtomicRmw => { @@ -3668,17 +4655,9 @@ fn lower_insn_to_regs>( }; let ty = ctx.output_ty(insn, 0); - let rhs = put_input_in_reg(ctx, rhs_input); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let lhs = if is_int_or_ref_ty(ty) && ty.bytes() < 4 { - // Special case: since the higher bits are undefined per CLIF semantics, we - // can just apply a 32-bit cmove here. Force inputs into registers, to - // avoid partial spilling out-of-bounds with memory accesses, though. - // Sign-extend operands to 32, then do a cmove of size 4. - RegMem::reg(put_input_in_reg(ctx, lhs_input)) - } else { - input_to_reg_mem(ctx, lhs_input) - }; + let rhs = put_input_in_regs(ctx, rhs_input); + let dst = get_output_reg(ctx, outputs[0]); + let lhs = put_input_in_regs(ctx, lhs_input); // We request inversion of Equal to NotEqual here: taking LHS if equal would mean // take it if both CC::NP and CC::Z are set, the conjunction of which can't be @@ -3691,15 +4670,20 @@ fn lower_insn_to_regs>( assert_eq!(cond_code, FloatCC::Equal); } - ctx.emit(Inst::gen_move(dst, rhs, ty)); + emit_moves(ctx, dst, rhs, ty); match fcmp_results { FcmpCondResult::Condition(cc) => { - if is_int_or_ref_ty(ty) { - let size = u8::max(ty.bytes() as u8, 4); - ctx.emit(Inst::cmove(size, cc, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 || ty == types::B128 { + let size = ty.bytes() as u8; + emit_cmoves(ctx, size, cc, lhs, dst); } else { - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } FcmpCondResult::AndConditions(_, _) => { @@ -3709,40 +4693,37 @@ fn lower_insn_to_regs>( } FcmpCondResult::InvertedEqualOrConditions(cc1, cc2) | FcmpCondResult::OrConditions(cc1, cc2) => { - if is_int_or_ref_ty(ty) { - let size = u8::max(ty.bytes() as u8, 4); - ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst)); - ctx.emit(Inst::cmove(size, cc2, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 { + let size = ty.bytes() as u8; + emit_cmoves(ctx, size, cc1, lhs.clone(), dst); + emit_cmoves(ctx, size, cc2, lhs, dst); } else { - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst)); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc1, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc2, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } } } else { let ty = ty.unwrap(); - let mut size = ty.bytes() as u8; - let lhs = if is_int_or_ref_ty(ty) { - if size < 4 { - // Special case: since the higher bits are undefined per CLIF semantics, we - // can just apply a 32-bit cmove here. Force inputs into registers, to - // avoid partial spilling out-of-bounds with memory accesses, though. 
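The select/fcmp changes above route through emit_moves and emit_cmoves so that multi-register (i128) values are handled: the false arm is moved into the destination registers unconditionally, then each 64-bit half is overwritten by a conditional move under the same condition code (or under two condition codes in the OrConditions case). A rough Rust model of the assumed semantics, for illustration only (emit_cmoves itself is defined elsewhere in the patch and is not shown here):

fn select_i128(cond: bool, lhs: (u64, u64), rhs: (u64, u64)) -> (u64, u64) {
    // Unconditional moves of the false arm first (emit_moves), then one cmov
    // per 64-bit half under the same flags (emit_cmoves).
    let mut dst = rhs;
    if cond {
        dst.0 = lhs.0;
        dst.1 = lhs.1;
    }
    dst
}

fn main() {
    assert_eq!(select_i128(true, (1, 2), (3, 4)), (1, 2));
    assert_eq!(select_i128(false, (1, 2), (3, 4)), (3, 4));
}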
- size = 4; - RegMem::reg(put_input_in_reg(ctx, inputs[1])) - } else { - input_to_reg_mem(ctx, inputs[1]) - } - } else { - input_to_reg_mem(ctx, inputs[1]) - }; - - let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let size = ty.bytes() as u8; + let lhs = put_input_in_regs(ctx, inputs[1]); + let rhs = put_input_in_regs(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { - emit_cmp(ctx, icmp); let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, icmp, cond_code); CC::from_intcc(cond_code) } else { let sel_ty = ctx.input_ty(insn, 0); @@ -3768,21 +4749,26 @@ fn lower_insn_to_regs>( }; // This doesn't affect the flags. - ctx.emit(Inst::gen_move(dst, rhs, ty)); + emit_moves(ctx, dst, rhs, ty); - if is_int_or_ref_ty(ty) { - ctx.emit(Inst::cmove(size, cc, lhs, dst)); + if is_int_or_ref_ty(ty) || ty == types::I128 { + emit_cmoves(ctx, size, cc, lhs, dst); } else { debug_assert!(ty == types::F32 || ty == types::F64); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } } Opcode::Selectif | Opcode::SelectifSpectreGuard => { - let lhs = input_to_reg_mem(ctx, inputs[1]); - let rhs = put_input_in_reg(ctx, inputs[2]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let lhs = put_input_in_regs(ctx, inputs[1]); + let rhs = put_input_in_regs(ctx, inputs[2]); + let dst = get_output_reg(ctx, outputs[0]); let ty = ctx.output_ty(insn, 0); // Verification ensures that the input is always a single-def ifcmp. @@ -3792,26 +4778,24 @@ fn lower_insn_to_regs>( .unwrap() .0; debug_assert_eq!(ctx.data(cmp_insn).opcode(), Opcode::Ifcmp); - emit_cmp(ctx, cmp_insn); + let cond_code = ctx.data(insn).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, cmp_insn, cond_code); - let cc = CC::from_intcc(ctx.data(insn).cond_code().unwrap()); + let cc = CC::from_intcc(cond_code); - if is_int_or_ref_ty(ty) { + if is_int_or_ref_ty(ty) || ty == types::I128 { let size = ty.bytes() as u8; - if size == 1 { - // Sign-extend operands to 32, then do a cmove of size 4. - let lhs_se = ctx.alloc_tmp(types::I32).only_reg().unwrap(); - ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se)); - ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst)); - ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst)); - } else { - ctx.emit(Inst::gen_move(dst, rhs, ty)); - ctx.emit(Inst::cmove(size, cc, lhs, dst)); - } + emit_moves(ctx, dst, rhs, ty); + emit_cmoves(ctx, size, cc, lhs, dst); } else { debug_assert!(ty == types::F32 || ty == types::F64); - ctx.emit(Inst::gen_move(dst, rhs, ty)); - ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst)); + emit_moves(ctx, dst, rhs, ty); + ctx.emit(Inst::xmm_cmove( + ty == types::F64, + cc, + RegMem::reg(lhs.only_reg().unwrap()), + dst.only_reg().unwrap(), + )); } } @@ -3894,8 +4878,19 @@ fn lower_insn_to_regs>( // The quotient is in rax. ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); } else { - // The remainder is in rdx. - ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + if size == 1 { + // The remainder is in AH. Right-shift by 8 bits then move from rax. 
+ ctx.emit(Inst::shift_r( + 8, + ShiftKind::ShiftRightLogical, + Some(8), + Writable::from_reg(regs::rax()), + )); + ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); + } else { + // The remainder is in rdx. + ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); + } } } @@ -4297,6 +5292,38 @@ fn lower_insn_to_regs>( } } + Opcode::Iconcat => { + let ty = ctx.output_ty(insn, 0); + assert_eq!( + ty, + types::I128, + "Iconcat not expected to be used for non-128-bit type" + ); + assert_eq!(ctx.input_ty(insn, 0), types::I64); + assert_eq!(ctx.input_ty(insn, 1), types::I64); + let lo = put_input_in_reg(ctx, inputs[0]); + let hi = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::gen_move(dst.regs()[0], lo, types::I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], hi, types::I64)); + } + + Opcode::Isplit => { + let ty = ctx.input_ty(insn, 0); + assert_eq!( + ty, + types::I128, + "Iconcat not expected to be used for non-128-bit type" + ); + assert_eq!(ctx.output_ty(insn, 0), types::I64); + assert_eq!(ctx.output_ty(insn, 1), types::I64); + let src = put_input_in_regs(ctx, inputs[0]); + let dst_lo = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + let dst_hi = get_output_reg(ctx, outputs[1]).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst_lo, src.regs()[0], types::I64)); + ctx.emit(Inst::gen_move(dst_hi, src.regs()[1], types::I64)); + } + Opcode::IaddImm | Opcode::ImulImm | Opcode::UdivImm @@ -4384,9 +5411,9 @@ impl LowerBackend for X64Backend { let src_ty = ctx.input_ty(branches[0], 0); if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) { - emit_cmp(ctx, icmp); - let cond_code = ctx.data(icmp).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, icmp, cond_code); + let cond_code = if op0 == Opcode::Brz { cond_code.inverse() } else { @@ -4416,6 +5443,32 @@ impl LowerBackend for X64Backend { } FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(), } + } else if src_ty == types::I128 { + let src = put_input_in_regs( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + ); + let (half_cc, comb_op) = match op0 { + Opcode::Brz => (CC::Z, AluRmiROpcode::And8), + Opcode::Brnz => (CC::NZ, AluRmiROpcode::Or8), + _ => unreachable!(), + }; + let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.regs()[0])); + ctx.emit(Inst::setcc(half_cc, tmp1)); + ctx.emit(Inst::cmp_rmi_r(8, RegMemImm::imm(0), src.regs()[1])); + ctx.emit(Inst::setcc(half_cc, tmp2)); + ctx.emit(Inst::alu_rmi_r( + false, + comb_op, + RegMemImm::reg(tmp1.to_reg()), + tmp2, + )); + ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken)); } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) { let src = put_input_in_reg( ctx, @@ -4483,8 +5536,8 @@ impl LowerBackend for X64Backend { }; if let Some(ifcmp) = matches_input(ctx, flag_input, Opcode::Ifcmp) { - emit_cmp(ctx, ifcmp); let cond_code = ctx.data(branches[0]).cond_code().unwrap(); + let cond_code = emit_cmp(ctx, ifcmp, cond_code); let cc = CC::from_intcc(cond_code); ctx.emit(Inst::jmp_cond(cc, taken, not_taken)); } else if let Some(ifcmp_sp) = matches_input(ctx, flag_input, Opcode::IfcmpSp) { diff --git a/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif b/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif new file mode 100644 index 0000000000..5795900438 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/bitops-i128-run.clif @@ -0,0 +1,27 @@ +test run +target x86_64 
+feature "experimental_x64" + +function %ctz(i64, i64) -> i8 { +block0(v0: i64, v1: i64): + v2 = iconcat v0, v1 + v3 = ctz.i128 v2 + v4 = ireduce.i8 v3 + return v4 +} +; run: %ctz(0x00000000_00000000, 0x00000001_00000000) == 96 +; run: %ctz(0x00000000_00010000, 0x00000001_00000000) == 16 +; run: %ctz(0x00000000_00010000, 0x00000000_00000000) == 16 +; run: %ctz(0x00000000_00000000, 0x00000000_00000000) == 128 + +function %clz(i64, i64) -> i8 { +block0(v0: i64, v1: i64): + v2 = iconcat v0, v1 + v3 = clz.i128 v2 + v4 = ireduce.i8 v3 + return v4 +} +; run: %clz(0x00000000_00000000, 0x00000001_00000000) == 31 +; run: %clz(0x00000000_00010000, 0x00000001_00000000) == 31 +; run: %clz(0x00000000_00010000, 0x00000000_00000000) == 111 +; run: %clz(0x00000000_00000000, 0x00000000_00000000) == 128 diff --git a/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif b/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif new file mode 100644 index 0000000000..64ea96716c --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/bitrev-i128-run.clif @@ -0,0 +1,47 @@ +test run +target x86_64 +feature "experimental_x64" + +function %reverse_bits_zero() -> b1 { +block0: + v0 = iconst.i64 0 + v1 = iconcat v0, v0 + v2 = bitrev.i128 v1 + v3 = icmp eq v2, v1 + return v3 +} +; run + +function %reverse_bits_one() -> b1 { +block0: + v0 = iconst.i64 0 + v1 = iconst.i64 1 + v2 = iconcat v0, v1 + + v3 = bitrev.i128 v2 + + v4 = iconst.i64 0x8000_0000_0000_0000 + v5 = iconst.i64 0 + v6 = iconcat v4, v5 + + v7 = icmp eq v3, v6 + return v7 +} +; run + +function %reverse_bits() -> b1 { +block0: + v0 = iconst.i64 0x06AD_8667_69EC_41BA + v1 = iconst.i64 0x6C83_D81A_6E28_83AB + v2 = iconcat v0, v1 + + v3 = bitrev.i128 v2 + + v4 = iconst.i64 0xD5C11476581BC136 + v5 = iconst.i64 0x5D823796E661B560 + v6 = iconcat v4, v5 + + v7 = icmp eq v3, v6 + return v7 +} +; run diff --git a/cranelift/filetests/filetests/isa/x64/floating-point.clif b/cranelift/filetests/filetests/isa/x64/floating-point.clif new file mode 100644 index 0000000000..b3b5907210 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/floating-point.clif @@ -0,0 +1,26 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f(f64) -> f64 { +block0(v0: f64): + v1 = fabs.f64 v0 + return v1 +} +; check: movabsq $$9223372036854775807, %rsi +; nextln: movq %rsi, %xmm1 +; nextln: andpd %xmm0, %xmm1 +; nextln: movaps %xmm1, %xmm0 + + +function %f(i64) -> f64 { +block0(v0: i64): + v1 = load.f64 v0 + v2 = fabs.f64 v1 + return v2 +} +; check: movsd 0(%rdi), %xmm0 +; nextln: movabsq $$9223372036854775807, %rsi +; nextln: movq %rsi, %xmm1 +; nextln: andpd %xmm0, %xmm1 +; nextln: movaps %xmm1, %xmm0 diff --git a/cranelift/filetests/filetests/isa/x64/i128.clif b/cranelift/filetests/filetests/isa/x64/i128.clif new file mode 100644 index 0000000000..e7ee34f283 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/i128.clif @@ -0,0 +1,1082 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = iadd v0, v1 +; nextln: addq %rdx, %rdi +; nextln: adcq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f1(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = isub v0, v1 +; nextln: subq %rdx, %rdi +; nextln: sbbq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; 
nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f2(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = band v0, v1 +; nextln: andq %rdx, %rdi +; nextln: andq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f3(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = bor v0, v1 +; nextln: orq %rdx, %rdi +; nextln: orq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f4(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + + v2 = bxor v0, v1 +; nextln: xorq %rdx, %rdi +; nextln: xorq %rcx, %rsi + + return v2 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f5(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + + v1 = bnot v0 +; nextln: notq %rdi +; nextln: notq %rsi + + return v1 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f6(i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): +; v0 in rdi:rsi, v1 in rdx:rcx + + v2 = imul v0, v1 +; nextln: movq %rsi, %rax +; nextln: movq %rcx, %r8 +; nextln: movq %rdi, %rsi +; nextln: imulq %rdx, %rsi +; nextln: movq %rdi, %rcx +; nextln: imulq %r8, %rcx +; nextln: imulq %rdx, %rax +; nextln: addq %rax, %rcx +; nextln: movq %rdi, %rax +; nextln: mul %rdx +; nextln: addq %rdx, %rcx +; nextln: movq %rsi, %rax +; nextln: movq %rcx, %rdx + + return v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f7(i64, i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64, v1: i64): + v2 = iconcat.i64 v0, v1 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx + + return v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f8(i128) -> i64, i64 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1, v2 = isplit.i128 v0 +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx + + return v1, v2 +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f9(i128, i128) -> b1 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i128): + v2 = icmp eq v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setz %r8b +; nextln: andq %rax, %r8 +; nextln: andq $$1, %r8 +; nextln: setnz %al + + v3 = icmp ne v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnz %r8b +; nextln: orq %rax, %r8 +; nextln: andq $$1, %r8 +; nextln: setnz %r8b + + v4 = icmp slt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setl %r9b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setb %r10b +; nextln: andq %rax, %r10 +; nextln: orq %r9, %r10 +; nextln: andq $$1, %r10 +; nextln: setnz %r9b + + v5 = icmp sle v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setl %r10b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setbe %r11b +; nextln: andq %rax, %r11 +; nextln: orq %r10, %r11 +; nextln: andq $$1, %r11 +; nextln: setnz %r10b + + v6 = icmp sgt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnle %r11b +; 
nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnbe %r12b +; nextln: andq %rax, %r12 +; nextln: orq %r11, %r12 +; nextln: andq $$1, %r12 +; nextln: setnz %r11b + + v7 = icmp sge v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnle %r12b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setnb %r13b +; nextln: andq %rax, %r13 +; nextln: orq %r12, %r13 +; nextln: andq $$1, %r13 +; nextln: setnz %r12b + + v8 = icmp ult v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setb %r13b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setb %r14b +; nextln: andq %rax, %r14 +; nextln: orq %r13, %r14 +; nextln: andq $$1, %r14 +; nextln: setnz %r13b + + v9 = icmp ule v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setb %r14b +; nextln: setz %al +; nextln: cmpq %rdx, %rdi +; nextln: setbe %bl +; nextln: andq %rax, %rbx +; nextln: orq %r14, %rbx +; nextln: andq $$1, %rbx +; nextln: setnz %r14b + + v10 = icmp ugt v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnbe %bl +; nextln: setz %r15b +; nextln: cmpq %rdx, %rdi +; nextln: setnbe %al +; nextln: andq %r15, %rax +; nextln: orq %rbx, %rax +; nextln: andq $$1, %rax +; nextln: setnz %bl + + v11 = icmp uge v0, v1 +; check: cmpq %rcx, %rsi +; nextln: setnbe %sil +; nextln: setz %cl +; nextln: cmpq %rdx, %rdi +; nextln: setnb %dil +; nextln: andq %rcx, %rdi +; nextln: orq %rsi, %rdi +; nextln: andq $$1, %rdi +; nextln: setnz %sil + + v12 = band v2, v3 + v13 = band v4, v5 + v14 = band v6, v7 + v15 = band v8, v9 + v16 = band v10, v11 + v17 = band v12, v13 + v18 = band v14, v15 + v19 = band v17, v18 + v20 = band v19, v16 + + return v20 +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f10(i128) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + brz v0, block1 +; check: cmpq $$0, %rdi +; nextln: setz %dil +; nextln: cmpq $$0, %rsi +; nextln: setz %sil +; nextln: andb %dil, %sil +; nextln: jnz label1; j label2 + + jump block2 + +block1: + v1 = iconst.i32 1 + return v1 + +block2: + v2 = iconst.i32 2 + return v2 + +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f11(i128) -> i32 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + brnz v0, block1 +; check: cmpq $$0, %rdi +; nextln: setnz %dil +; nextln: cmpq $$0, %rsi +; nextln: setnz %sil +; nextln: orb %dil, %sil +; nextln: jnz label1; j label2 + jump block2 + +block1: + v1 = iconst.i32 1 + return v1 + +block2: + v2 = iconst.i32 2 + return v2 + +; check: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f12(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 = uextend.i128 v0 + return v1 + +; nextln: movq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f13(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 = sextend.i128 v0 + return v1 + +; nextln: movq %rdi, %rsi +; nextln: movq %rsi, %rdi +; nextln: sarq $$63, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f14(i8) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i8): + v1 = sextend.i128 v0 + return v1 + +; nextln: movsbq %dil, %rsi +; nextln: movq %rsi, %rdi +; nextln: sarq $$63, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function 
%f15(i8) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i8): + v1 = uextend.i128 v0 + return v1 + +; nextln: movzbq %dil, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f16(i128) -> i64 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i64 v0 + return v1 + +; nextln: movq %rdi, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f17(i128) -> i8 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i8 v0 + return v1 + +; nextln: movq %rdi, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f18(b1) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: b1): + v1 = bint.i128 v0 + return v1 + +; check: movzbq %dil, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f19(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = popcnt.i128 v0 + return v1 + +; check: movq %rsi, %rdx +; nextln: movq %rdi, %rsi +; nextln: shrq $$1, %rsi +; nextln: movabsq $$8608480567731124087, %rcx +; nextln: andq %rcx, %rsi +; nextln: movq %rdi, %rax +; nextln: subq %rsi, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rcx, %rsi +; nextln: subq %rsi, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rcx, %rsi +; nextln: subq %rsi, %rax +; nextln: movq %rax, %rsi +; nextln: shrq $$4, %rsi +; nextln: addq %rax, %rsi +; nextln: movabsq $$1085102592571150095, %rdi +; nextln: andq %rdi, %rsi +; nextln: movabsq $$72340172838076673, %rdi +; nextln: imulq %rdi, %rsi +; nextln: shrq $$56, %rsi +; nextln: movq %rdx, %rax +; nextln: shrq $$1, %rax +; nextln: movabsq $$8608480567731124087, %rcx +; nextln: andq %rcx, %rax +; nextln: movq %rdx, %rdi +; nextln: subq %rax, %rdi +; nextln: shrq $$1, %rax +; nextln: andq %rcx, %rax +; nextln: subq %rax, %rdi +; nextln: shrq $$1, %rax +; nextln: andq %rcx, %rax +; nextln: subq %rax, %rdi +; nextln: movq %rdi, %rax +; nextln: shrq $$4, %rax +; nextln: addq %rdi, %rax +; nextln: movabsq $$1085102592571150095, %rdi +; nextln: andq %rdi, %rax +; nextln: movabsq $$72340172838076673, %rdi +; nextln: imulq %rdi, %rax +; nextln: shrq $$56, %rax +; nextln: addq %rax, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + + +function %f20(i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = bitrev.i128 v0 + return v1 + +; check: movq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$6148914691236517205, %rax +; nextln: shrq $$1, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$1, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$3689348814741910323, %rax +; nextln: shrq $$2, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$2, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$1085102592571150095, %rax +; nextln: shrq $$4, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$4, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$71777214294589695, %rax +; nextln: 
shrq $$8, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$8, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rdi, %rcx +; nextln: movq %rcx, %rdi +; nextln: movabsq $$281470681808895, %rax +; nextln: shrq $$16, %rdi +; nextln: andq %rax, %rdi +; nextln: andq %rcx, %rax +; nextln: shlq $$16, %rax +; nextln: orq %rdi, %rax +; nextln: movq %rax, %rcx +; nextln: movl $$-1, %edi +; nextln: shrq $$32, %rcx +; nextln: andq %rdi, %rcx +; nextln: andq %rax, %rdi +; nextln: shlq $$32, %rdi +; nextln: orq %rcx, %rdi +; nextln: movq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$6148914691236517205, %rax +; nextln: shrq $$1, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$1, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$3689348814741910323, %rax +; nextln: shrq $$2, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$2, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$1085102592571150095, %rax +; nextln: shrq $$4, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$4, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$71777214294589695, %rax +; nextln: shrq $$8, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$8, %rax +; nextln: movq %rax, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rsi +; nextln: movabsq $$281470681808895, %rax +; nextln: shrq $$16, %rsi +; nextln: andq %rax, %rsi +; nextln: andq %rcx, %rax +; nextln: shlq $$16, %rax +; nextln: orq %rsi, %rax +; nextln: movq %rax, %rsi +; nextln: movl $$-1, %ecx +; nextln: shrq $$32, %rsi +; nextln: andq %rcx, %rsi +; nextln: andq %rax, %rcx +; nextln: shlq $$32, %rcx +; nextln: orq %rsi, %rcx +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f21(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = ushr v0, v1 + return v2 + +; check: movq %rdi, %rax +; nextln: movq %rsi, %rdi +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: shlq %cl, %rdi +; nextln: orq %rax, %rdi +; nextln: xorq %rax, %rax +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rsi, %rax +; nextln: cmovzq %rdi, %rcx +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rax, %rdx +; nextln: movq %rcx, %rax + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f22(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = ishl v0, v1 + return v2 + +; check: movq %rsi, %rax +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: shrq %cl, %rdi +; nextln: orq %rax, %rdi +; nextln: xorq %rax, %rax +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rdi, %rcx +; nextln: cmovzq %rsi, %rax +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rcx, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f23(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = sshr v0, 
v1 + return v2 + +; check: movq %rdi, %r8 +; nextln: movq %rsi, %rdi +; nextln: movq %rdi, %rsi +; nextln: movq %rdx, %rcx +; nextln: sarq %cl, %rsi +; nextln: movq %rdx, %rcx +; nextln: sarq %cl, %r8 +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rdi, %rax +; nextln: shlq %cl, %rax +; nextln: orq %r8, %rax +; nextln: sarq $$63, %rdi +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %rdx +; nextln: cmovzq %rsi, %rdi +; nextln: cmovzq %rax, %rcx +; nextln: cmovnzq %rsi, %rcx +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f24(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = rotr.i128 v0, v1 + return v2 + +; check: movq %rsi, %r9 +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %r9 +; nextln: movq %rdi, %rax +; nextln: movq %rdx, %rcx +; nextln: shrq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rsi, %r10 +; nextln: shlq %cl, %r10 +; nextln: orq %rax, %r10 +; nextln: xorq %r8, %r8 +; nextln: xorq %rax, %rax +; nextln: movq %rdx, %rcx +; nextln: andq $$64, %rcx +; nextln: cmovzq %r9, %r8 +; nextln: cmovzq %r10, %rax +; nextln: cmovnzq %r9, %rax +; nextln: movl $$128, %r9d +; nextln: subq %rdx, %r9 +; nextln: movq %rdi, %rdx +; nextln: movq %r9, %rcx +; nextln: shlq %cl, %rdx +; nextln: movq %r9, %rcx +; nextln: shlq %cl, %rsi +; nextln: movl $$64, %ecx +; nextln: subq %r9, %rcx +; nextln: movq %rdi, %r10 +; nextln: shrq %cl, %r10 +; nextln: orq %rsi, %r10 +; nextln: xorq %rsi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: andq $$64, %r9 +; nextln: cmovzq %r10, %rdi +; nextln: cmovzq %rdx, %rsi +; nextln: cmovnzq %rdx, %rdi +; nextln: orq %rax, %rsi +; nextln: orq %r8, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f25(i128, i32) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i32): + v2 = rotl.i128 v0, v1 + return v2 + +; check: movq %rdi, %r9 +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %r9 +; nextln: movq %rsi, %rax +; nextln: movq %rdx, %rcx +; nextln: shlq %cl, %rax +; nextln: movl $$64, %ecx +; nextln: subq %rdx, %rcx +; nextln: movq %rdi, %r10 +; nextln: shrq %cl, %r10 +; nextln: orq %rax, %r10 +; nextln: xorq %r8, %r8 +; nextln: xorq %rax, %rax +; nextln: movq %rdx, %rcx +; nextln: andq $$64, %rcx +; nextln: cmovzq %r10, %rax +; nextln: cmovzq %r9, %r8 +; nextln: cmovnzq %r9, %rax +; nextln: movl $$128, %r9d +; nextln: subq %rdx, %r9 +; nextln: movq %rsi, %rdx +; nextln: movq %r9, %rcx +; nextln: shrq %cl, %rdx +; nextln: movq %r9, %rcx +; nextln: shrq %cl, %rdi +; nextln: movl $$64, %ecx +; nextln: subq %r9, %rcx +; nextln: shlq %cl, %rsi +; nextln: orq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: xorq %rcx, %rcx +; nextln: andq $$64, %r9 +; nextln: cmovzq %rdx, %rdi +; nextln: cmovzq %rsi, %rcx +; nextln: cmovnzq %rdx, %rcx +; nextln: orq %r8, %rcx +; nextln: orq %rax, %rdi +; nextln: movq %rcx, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f26(i128, i64) { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128, v1: i64): + store.i128 v0, v1 + return + +; check: movq %rdi, 0(%rdx) +; nextln: movq %rsi, 8(%rdx) + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f27(i64) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i64): + v1 
= load.i128 v0 + return v1 + +; check: movq 0(%rdi), %rsi +; nextln: movq 8(%rdi), %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +} + +function %f28(i128, b1) -> i128 { +block0(v0: i128, v1: b1): + v2 = iconst.i128 0 + brnz v1, block1(v2) + jump block2(v2) + +block1(v3: i128): + v4 = iconst.i128 1 + v5 = iadd.i128 v3, v4 + return v5 + +block2(v6: i128): + v7 = iconst.i128 2 + v8 = iadd.i128 v6, v7 + return v8 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: testb $$1, %dl +; nextln: jnz label1; j label2 +; check: Block 1: +; check: movl $$0, %esi +; nextln: movl $$0, %edi +; nextln: movl $$1, %eax +; nextln: movl $$0, %ecx +; nextln: addq %rax, %rsi +; nextln: adcq %rcx, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret +; check: Block 2: +; check: movl $$0, %esi +; nextln: movl $$0, %edi +; nextln: movl $$2, %eax +; nextln: movl $$0, %ecx +; nextln: addq %rax, %rsi +; nextln: adcq %rcx, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f29(i128, i128, i64, i128, i128, i128) -> i128 { + +block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128): + v6 = iadd.i128 v0, v1 + v7 = uextend.i128 v2 + v8 = iadd.i128 v3, v7 + v9 = iadd.i128 v4, v5 + v10 = iadd.i128 v6, v8 + v11 = iadd.i128 v9, v10 + return v11 + +; check: movq %rsp, %rbp +; nextln: subq $$16, %rsp +; nextln: movq %r12, 0(%rsp) +; nextln: movq %r13, 8(%rsp) +; nextln: virtual_sp_offset_adjust 16 +; nextln: movq 16(%rbp), %r9 +; nextln: movq 24(%rbp), %r10 +; nextln: movq 32(%rbp), %r12 +; nextln: movq 40(%rbp), %r11 +; nextln: movq 48(%rbp), %rax +; nextln: movq 56(%rbp), %r13 +; nextln: addq %rdx, %rdi +; nextln: adcq %rcx, %rsi +; nextln: xorq %rcx, %rcx +; nextln: addq %r8, %r9 +; nextln: adcq %rcx, %r10 +; nextln: addq %rax, %r12 +; nextln: adcq %r13, %r11 +; nextln: addq %r9, %rdi +; nextln: adcq %r10, %rsi +; nextln: addq %rdi, %r12 +; nextln: adcq %rsi, %r11 +; nextln: movq %r12, %rax +; nextln: movq %r11, %rdx +; nextln: movq 0(%rsp), %r12 +; nextln: movq 8(%rsp), %r13 +; nextln: addq $$16, %rsp +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f30(i128) -> i128, i128, i128, i64, i128, i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i128): + v1 = ireduce.i64 v0 + return v0, v0, v0, v1, v0, v0 + +; likely to change with regalloc -- just check the stores into the retval area: + +; check: movq %r8, 0(%rsi) +; nextln: movq %r9, 8(%rsi) +; nextln: movq %r10, 16(%rsi) +; nextln: movq %r11, 24(%rsi) +; nextln: movq %r12, 32(%rsi) +; nextln: movq %r13, 48(%rsi) +; nextln: movq %r14, 56(%rsi) +; nextln: movq %rdi, 64(%rsi) +; nextln: movq %rbx, 72(%rsi) + +} + +function %f31(i128, i128) -> i128, i128 { + fn0 = %g(i128, i128) -> i128, i128 +block0(v0: i128, v1: i128): + v2, v3 = call fn0(v0, v1) + return v2, v3 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: subq $$16, %rsp +; nextln: movq %r12, 0(%rsp) +; nextln: virtual_sp_offset_adjust 8 +; nextln: movq %r8, %r12 +; nextln: subq $$16, %rsp +; nextln: virtual_sp_offset_adjust 16 +; nextln: lea 0(%rsp), %r8 +; nextln: load_ext_name %g+0, %rax +; nextln: call *%rax +; nextln: movq 0(%rsp), %rsi +; nextln: movq 8(%rsp), %rdi +; nextln: addq $$16, %rsp +; nextln: virtual_sp_offset_adjust -16 +; nextln: movq %rsi, 0(%r12) +; nextln: movq %rdi, 8(%r12) +; nextln: movq 
0(%rsp), %r12 +; nextln: addq $$16, %rsp +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f32(i128) -> i128 { +block0(v0: i128): + v1 = clz.i128 v0 + return v1 + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: movabsq $$-1, %rcx +; nextln: bsrq %rsi, %rax +; nextln: cmovzq %rcx, %rax +; nextln: movl $$63, %esi +; nextln: subq %rax, %rsi +; nextln: movabsq $$-1, %rcx +; nextln: bsrq %rdi, %rax +; nextln: cmovzq %rcx, %rax +; nextln: movl $$63, %edi +; nextln: subq %rax, %rdi +; nextln: addq $$64, %rdi +; nextln: cmpq $$64, %rsi +; nextln: cmovnzq %rsi, %rdi +; nextln: xorq %rsi, %rsi +; nextln: movq %rdi, %rax +; nextln: movq %rsi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + +function %f33(i128) -> i128 { +block0(v0: i128): + v1 = ctz.i128 v0 + return v1 +} + +; check: pushq %rbp +; nextln: movq %rsp, %rbp +; nextln: movq %rsi, %rax +; nextln: movl $$64, %ecx +; nextln: bsfq %rdi, %rsi +; nextln: cmovzq %rcx, %rsi +; nextln: movl $$64, %ecx +; nextln: bsfq %rax, %rdi +; nextln: cmovzq %rcx, %rdi +; nextln: addq $$64, %rdi +; nextln: cmpq $$64, %rsi +; nextln: cmovzq %rdi, %rsi +; nextln: xorq %rdi, %rdi +; nextln: movq %rsi, %rax +; nextln: movq %rdi, %rdx +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret diff --git a/cranelift/filetests/filetests/isa/x64/select-i128.clif b/cranelift/filetests/filetests/isa/x64/select-i128.clif new file mode 100644 index 0000000000..3492a71997 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/select-i128.clif @@ -0,0 +1,29 @@ +test compile +target x86_64 +feature "experimental_x64" + +function %f0(i32, i128, i128) -> i128 { +; check: pushq %rbp +; nextln: movq %rsp, %rbp + +block0(v0: i32, v1: i128, v2: i128): + + v3 = iconst.i32 42 + v4 = icmp.i32 eq v0, v3 +; nextln: movl $$42, %eax +; nextln: cmpl %eax, %edi + + v5 = select.i128 v4, v1, v2 +; nextln: cmovzq %rsi, %rcx +; nextln: cmovzq %rdx, %r8 + + return v5 +; nextln: movq %rcx, %rax +; nextln: movq %r8, %rdx + +; nextln: movq %rbp, %rsp +; nextln: popq %rbp +; nextln: ret + +} + diff --git a/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif new file mode 100644 index 0000000000..37bc4667e7 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif @@ -0,0 +1,106 @@ +test run +target x86_64 +feature "experimental_x64" + +function %ishl1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconcat v0, v0 + v2 = iconst.i32 2 + v3 = ishl.i128 v1, v2 + v4 = iconst.i64 0x04040404_04040404 + v5 = iconcat v4, v4 + v6 = icmp eq v3, v5 + return v6 +} +; run + +function %ishl2() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 9 + v4 = ishl.i128 v2, v3 + v5 = iconst.i64 0x02020202_02020200 + v6 = iconst.i64 0x02020202_02020202 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ishl3() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0xffffffff_ffffffff + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = ishl.i128 v2, v3 + v5 = iconst.i64 0x00000000_00000000 + v6 = iconst.i64 0x04040404_04040404 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ushr1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 2 + v4 = ushr.i128 v2, v3 + v5 = iconst.i64 0x40404040_40404040 + v6 = iconst.i64 
0x00404040_40404040 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %ushr2() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x01010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = ushr.i128 v2, v3 + v5 = iconst.i64 0x00404040_40404040 + v6 = iconst.i64 0x00000000_00000000 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %sshr1() -> b1 { +block0: + v0 = iconst.i64 0x01010101_01010101 + v1 = iconst.i64 0x81010101_01010101 + v2 = iconcat v0, v1 + v3 = iconst.i32 2 + v4 = sshr.i128 v2, v3 + v5 = iconst.i64 0x40404040_40404040 + v6 = iconst.i64 0xe0404040_40404040 + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run + +function %sshr2() -> b1 { +block0: + v0 = iconst.i64 0x12345678_9abcdef0 + v1 = iconst.i64 0x80101010_10101010 + v2 = iconcat v0, v1 + v3 = iconst.i32 66 + v4 = sshr.i128 v2, v3 + v5 = iconst.i64 0xe0040404_04040404 + v6 = iconst.i64 0xffffffff_ffffffff + v7 = iconcat v5, v6 + v8 = icmp eq v4, v7 + return v8 +} +; run
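The run tests above exercise the two-word shift decomposition visible in the i128.clif expected code: amounts below 64 combine the shifted word with the bits carried across from the other word, amounts of 64 and above select the cross-word result via cmov, and arithmetic right shifts fill the upper word with copies of the sign bit. A standalone Rust reference model (illustrative only; shift amounts are taken modulo 128, as in the lowering) that reproduces some of the cases checked above:

fn ishl128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        // High word takes its own shifted bits plus the bits carried out of the low word.
        (lo << amt, (hi << amt) | (lo >> (64 - amt)))
    } else {
        // 64..127: the low word becomes zero; the shifted low word lands in the high position.
        (0, lo << (amt - 64))
    }
}

fn ushr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        ((lo >> amt) | (hi << (64 - amt)), hi >> amt)
    } else {
        (hi >> (amt - 64), 0)
    }
}

fn sshr128(lo: u64, hi: u64, amt: u32) -> (u64, u64) {
    let amt = amt & 127;
    // The upper word of the result is sign-filled when shifting by 64 or more.
    let sign = ((hi as i64) >> 63) as u64;
    if amt == 0 {
        (lo, hi)
    } else if amt < 64 {
        ((lo >> amt) | (hi << (64 - amt)), ((hi as i64) >> amt) as u64)
    } else {
        (((hi as i64) >> (amt - 64)) as u64, sign)
    }
}

fn main() {
    // Mirrors %ishl3: shifting lo=0x0101..., hi=0xffff... left by 66.
    assert_eq!(
        ishl128(0x0101_0101_0101_0101, 0xffff_ffff_ffff_ffff, 66),
        (0, 0x0404_0404_0404_0404)
    );
    // Mirrors %sshr1: arithmetic right shift by 2 with a negative high word.
    assert_eq!(
        sshr128(0x0101_0101_0101_0101, 0x8101_0101_0101_0101, 2),
        (0x4040_4040_4040_4040, 0xe040_4040_4040_4040)
    );
    // Mirrors %ushr2: logical right shift by 66.
    assert_eq!(
        ushr128(0x0101_0101_0101_0101, 0x0101_0101_0101_0101, 66),
        (0x0040_4040_4040_4040, 0)
    );
}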