x64: Lower stack_addr, udiv, sdiv, urem, srem, umulhi, smulhi in ISLE (#4741)

Lower stack_addr, udiv, sdiv, urem, srem, umulhi, and smulhi in ISLE.

For udiv, sdiv, urem, and srem I opted to move the original lowering into an extern constructor, as the interactions with rax and rdx for the div instruction didn't seem meaningful to implement in ISLE. However, I'm happy to revisit this choice and move more of the embedding into ISLE.
This commit is contained in:
Trevor Elliott
2022-08-23 11:22:49 -07:00
committed by GitHub
parent 3b68d76905
commit b5f1ab7780
13 changed files with 585 additions and 159 deletions

View File

@@ -2738,7 +2738,7 @@
(rule (mul_hi ty signed src1 src2)
(let ((dst_lo WritableGpr (temp_writable_gpr))
(dst_hi WritableGpr (temp_writable_gpr))
(size OperandSize (operand_size_of_type_32_64 ty))
(size OperandSize (raw_operand_size_of_type ty))
(_ Unit (emit (MInst.MulHi size
signed
src1
@@ -3587,6 +3587,25 @@
;; Bitcast a 64-bit GPR value into an XMM register via `movq`.
(rule (bitcast_gpr_to_xmm $I64 src)
(gpr_to_xmm (SseOpcode.Movq) src (OperandSize.Size64)))
;;;; Stack Addresses ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Materialize the address of `stack_slot` (plus `offset`) into a fresh
;; temporary GPR by emitting the ABI's stackslot-address instruction.
(decl stack_addr_impl (StackSlot Offset32) Gpr)
(rule (stack_addr_impl stack_slot offset)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (abi_stackslot_addr dst stack_slot offset))))
dst))
;;;; Division/Remainders ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; External (Rust) constructor: the div/idiv sequence has fixed rax/rdx
;; register choreography, so the instructions are emitted imperatively
;; rather than being expressed as ISLE rules.
(decl emit_div_or_rem (DivOrRemKind Type WritableGpr Gpr Gpr) Unit)
(extern constructor emit_div_or_rem emit_div_or_rem)
;; Shared lowering for udiv/sdiv/urem/srem: allocate the destination GPR
;; and delegate all instruction emission to `emit_div_or_rem`.
(decl div_or_rem (DivOrRemKind Value Value) Gpr)
(rule (div_or_rem kind a @ (value_type ty) b)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit_div_or_rem kind ty dst a b)))
dst))
;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(convert Gpr InstOutput output_gpr)

View File

@@ -1575,7 +1575,7 @@ impl fmt::Display for ShiftKind {
}
/// Which kind of division or remainder instruction this is.
#[derive(Clone)]
#[derive(Clone, Eq, PartialEq)]
pub enum DivOrRemKind {
SignedDiv,
UnsignedDiv,

View File

@@ -49,6 +49,23 @@ impl Inst {
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}
/// Build a widening one-operand multiply (`mul`/`imul`) instruction.
///
/// The hardware form implicitly takes its first source in `rax` and
/// writes the double-width product into `rax` (low half) and `rdx`
/// (high half); those fixed registers are encoded below.
fn mul_hi(size: OperandSize, signed: bool, rhs: RegMem) -> Inst {
    // Only the 16/32/64-bit forms are handled by this constructor.
    debug_assert!(size.is_one_of(&[
        OperandSize::Size16,
        OperandSize::Size32,
        OperandSize::Size64
    ]));
    rhs.assert_regclass_is(RegClass::Int);
    let implicit_lo = Gpr::new(regs::rax()).unwrap();
    let implicit_hi = Gpr::new(regs::rdx()).unwrap();
    Inst::MulHi {
        size,
        signed,
        src1: implicit_lo,
        src2: GprMem::new(rhs).unwrap(),
        dst_lo: WritableGpr::from_reg(implicit_lo),
        dst_hi: WritableGpr::from_reg(implicit_hi),
    }
}
}
#[test]

View File

@@ -208,23 +208,6 @@ impl Inst {
}
}
/// Build a widening one-operand multiply (`mul`/`imul`): the first
/// source is implicitly `rax`, and the product lands in `rax` (low
/// half) and `rdx` (high half).
pub(crate) fn mul_hi(size: OperandSize, signed: bool, rhs: RegMem) -> Inst {
// Only 16/32/64-bit operand sizes are supported by this constructor.
debug_assert!(size.is_one_of(&[
OperandSize::Size16,
OperandSize::Size32,
OperandSize::Size64
]));
rhs.assert_regclass_is(RegClass::Int);
Inst::MulHi {
size,
signed,
src1: Gpr::new(regs::rax()).unwrap(),
src2: GprMem::new(rhs).unwrap(),
dst_lo: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()),
dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
}
}
pub(crate) fn checked_div_or_rem_seq(
kind: DivOrRemKind,
size: OperandSize,

View File

@@ -3426,3 +3426,62 @@
(rule (lower (has_type (use_sse41) (trunc a @ (value_type $F64X2))))
(x64_roundpd a (RoundImm.RoundZero)))
;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (stack_addr stack_slot offset))
(stack_addr_impl stack_slot offset))
;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; All four div/rem opcodes share `div_or_rem`, which defers to the
;; external `emit_div_or_rem` constructor for the rax/rdx sequence.
(rule (lower (udiv a @ (value_type ty) b))
(div_or_rem (DivOrRemKind.UnsignedDiv) a b))
;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (sdiv a @ (value_type ty) b))
(div_or_rem (DivOrRemKind.SignedDiv) a b))
;; Rules for `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (urem a @ (value_type ty) b))
(div_or_rem (DivOrRemKind.UnsignedRem) a b))
;; Rules for `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (srem a @ (value_type ty) b))
(div_or_rem (DivOrRemKind.SignedRem) a b))
;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `mul_hi` returns a (lo, hi) register pair; the high half of the
;; product — index 1 of the pair — is the `umulhi`/`smulhi` result.
(rule (lower (umulhi a @ (value_type $I16) b))
(let ((res ValueRegs (mul_hi $I16 $false a b))
(hi Gpr (value_regs_get_gpr res 1)))
hi))
(rule (lower (umulhi a @ (value_type $I32) b))
(let ((res ValueRegs (mul_hi $I32 $false a b))
(hi Gpr (value_regs_get_gpr res 1)))
hi))
(rule (lower (umulhi a @ (value_type $I64) b))
(let ((res ValueRegs (mul_hi $I64 $false a b))
(hi Gpr (value_regs_get_gpr res 1)))
hi))
;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (smulhi a @ (value_type $I16) b))
(let ((res ValueRegs (mul_hi $I16 $true a b))
(hi Gpr (value_regs_get_gpr res 1)))
hi))
(rule (lower (smulhi a @ (value_type $I32) b))
(let ((res ValueRegs (mul_hi $I32 $true a b))
(hi Gpr (value_regs_get_gpr res 1)))
hi))
(rule (lower (smulhi a @ (value_type $I64) b))
(let ((res ValueRegs (mul_hi $I64 $true a b))
(hi Gpr (value_regs_get_gpr res 1)))
hi))

View File

@@ -14,7 +14,6 @@ use crate::machinst::*;
use crate::result::CodegenResult;
use crate::settings::{Flags, TlsModel};
use smallvec::SmallVec;
use std::convert::TryFrom;
use target_lexicon::Triple;
//=============================================================================
@@ -574,150 +573,19 @@ fn lower_insn_to_regs(
| Opcode::Ceil
| Opcode::Floor
| Opcode::Nearest
| Opcode::Trunc => {
| Opcode::Trunc
| Opcode::StackAddr
| Opcode::Udiv
| Opcode::Urem
| Opcode::Sdiv
| Opcode::Srem
| Opcode::Umulhi
| Opcode::Smulhi => {
implemented_in_isle(ctx);
}
Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
Opcode::StackAddr => {
let (stack_slot, offset) = match *ctx.data(insn) {
InstructionData::StackLoad {
opcode: Opcode::StackAddr,
stack_slot,
offset,
} => (stack_slot, offset),
_ => unreachable!(),
};
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let offset: i32 = offset.into();
let inst =
ctx.abi()
.sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst);
ctx.emit(inst);
}
Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => {
let kind = match op {
Opcode::Udiv => DivOrRemKind::UnsignedDiv,
Opcode::Sdiv => DivOrRemKind::SignedDiv,
Opcode::Urem => DivOrRemKind::UnsignedRem,
Opcode::Srem => DivOrRemKind::SignedRem,
_ => unreachable!(),
};
let is_div = kind.is_div();
let input_ty = ctx.input_ty(insn, 0);
let size = OperandSize::from_ty(input_ty);
let dividend = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rax()),
dividend,
input_ty,
));
// Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly.
if flags.avoid_div_traps() || op == Opcode::Srem {
// A vcode meta-instruction is used to lower the inline checks, since they embed
// pc-relative offsets that must not change, thus requiring regalloc to not
// interfere by introducing spills and reloads.
//
// Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
// regalloc is aware of the coalescing opportunity between rax/rdx and the
// destination register.
let divisor = put_input_in_reg(ctx, inputs[1]);
let divisor_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64));
let tmp = if op == Opcode::Sdiv && size == OperandSize::Size64 {
Some(ctx.alloc_tmp(types::I64).only_reg().unwrap())
} else {
None
};
// TODO use xor
ctx.emit(Inst::imm(
OperandSize::Size32,
0,
Writable::from_reg(regs::rdx()),
));
ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp));
} else {
// We don't want more than one trap record for a single instruction,
// so let's not allow the "mem" case (load-op merging) here; force
// divisor into a register instead.
let divisor = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
// Fill in the high parts:
if kind.is_signed() {
// sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
// signed opcodes.
ctx.emit(Inst::sign_extend_data(size));
} else if input_ty == types::I8 {
ctx.emit(Inst::movzx_rm_r(
ExtMode::BL,
RegMem::reg(regs::rax()),
Writable::from_reg(regs::rax()),
));
} else {
// zero for unsigned opcodes.
ctx.emit(Inst::imm(
OperandSize::Size64,
0,
Writable::from_reg(regs::rdx()),
));
}
// Emit the actual idiv.
ctx.emit(Inst::div(size, kind.is_signed(), divisor));
}
// Move the result back into the destination reg.
if is_div {
// The quotient is in rax.
ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
} else {
if size == OperandSize::Size8 {
// The remainder is in AH. Right-shift by 8 bits then move from rax.
ctx.emit(Inst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(8),
Writable::from_reg(regs::rax()),
));
ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty));
} else {
// The remainder is in rdx.
ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
}
}
}
Opcode::Umulhi | Opcode::Smulhi => {
let input_ty = ctx.input_ty(insn, 0);
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
// Move lhs in %rax.
ctx.emit(Inst::gen_move(
Writable::from_reg(regs::rax()),
lhs,
input_ty,
));
// Emit the actual mul or imul.
let signed = op == Opcode::Smulhi;
ctx.emit(Inst::mul_hi(OperandSize::from_ty(input_ty), signed, rhs));
// Read the result from the high part (stored in %rdx).
ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty));
}
Opcode::GetPinnedReg => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64));

View File

@@ -848,6 +848,108 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
self.lower_ctx
.use_constant(VCodeConstantData::WellKnown(&UMAX_MASK))
}
/// Emit the x64 lowering for integer `udiv`/`sdiv`/`urem`/`srem`.
///
/// The `div`/`idiv` instruction implicitly takes its dividend in
/// rdx:rax and leaves the quotient in rax and the remainder in rdx, so
/// this sequence: moves `dividend` into rax, fills in the high half
/// (sign/zero extension), emits either the checked divide
/// meta-instruction or a plain divide, and finally moves the
/// appropriate implicit result register into `dst`.
fn emit_div_or_rem(
&mut self,
kind: &DivOrRemKind,
ty: Type,
dst: WritableGpr,
dividend: Gpr,
divisor: Gpr,
) {
let is_div = kind.is_div();
let size = OperandSize::from_ty(ty);
// div/idiv implicitly read the dividend's low half from rax.
self.lower_ctx.emit(MInst::gen_move(
Writable::from_reg(regs::rax()),
dividend.to_reg(),
ty,
));
// Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly.
if self.flags.avoid_div_traps() || *kind == DivOrRemKind::SignedRem {
// A vcode meta-instruction is used to lower the inline checks, since they embed
// pc-relative offsets that must not change, thus requiring regalloc to not
// interfere by introducing spills and reloads.
//
// Note it keeps the result in $rax (for divide) or $rdx (for rem), so that
// regalloc is aware of the coalescing opportunity between rax/rdx and the
// destination register.
let divisor_copy = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap();
self.lower_ctx
.emit(MInst::gen_move(divisor_copy, divisor.to_reg(), types::I64));
// The checked sequence takes an extra scratch register only for
// 64-bit signed division.
let tmp = if *kind == DivOrRemKind::SignedDiv && size == OperandSize::Size64 {
Some(self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap())
} else {
None
};
// TODO use xor
self.lower_ctx.emit(MInst::imm(
OperandSize::Size32,
0,
Writable::from_reg(regs::rdx()),
));
self.lower_ctx.emit(MInst::checked_div_or_rem_seq(
kind.clone(),
size,
divisor_copy,
tmp,
));
} else {
// We don't want more than one trap record for a single instruction,
// so let's not allow the "mem" case (load-op merging) here; force
// divisor into a register instead.
let divisor = RegMem::reg(divisor.to_reg());
// Fill in the high parts:
if kind.is_signed() {
// sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for
// signed opcodes.
self.lower_ctx.emit(MInst::sign_extend_data(size));
} else if ty == types::I8 {
// Unsigned 8-bit divide consumes all of ax; zero-extend al into eax.
self.lower_ctx.emit(MInst::movzx_rm_r(
ExtMode::BL,
RegMem::reg(regs::rax()),
Writable::from_reg(regs::rax()),
));
} else {
// zero for unsigned opcodes.
self.lower_ctx.emit(MInst::imm(
OperandSize::Size64,
0,
Writable::from_reg(regs::rdx()),
));
}
// Emit the actual idiv.
self.lower_ctx
.emit(MInst::div(size, kind.is_signed(), divisor));
}
// Move the result back into the destination reg.
if is_div {
// The quotient is in rax.
self.lower_ctx
.emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty));
} else {
if size == OperandSize::Size8 {
// The remainder is in AH. Right-shift by 8 bits then move from rax.
self.lower_ctx.emit(MInst::shift_r(
OperandSize::Size64,
ShiftKind::ShiftRightLogical,
Some(8),
Writable::from_reg(regs::rax()),
));
self.lower_ctx
.emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty));
} else {
// The remainder is in rdx.
self.lower_ctx
.emit(MInst::gen_move(dst.to_writable_reg(), regs::rdx(), ty));
}
}
}
}
impl IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {

View File

@@ -0,0 +1,67 @@
test compile precise-output
target x86_64
function %f1(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = sdiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; cbw %al, %dl
; idiv %al, (none), %sil, %al, %dl
; movq %rbp, %rsp
; popq %rbp
; ret
function %f2(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = sdiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; cwd %ax, %dx
; idiv %ax, %dx, %si, %ax, %dx
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = sdiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; cdq %eax, %edx
; idiv %eax, %edx, %esi, %eax, %edx
; movq %rbp, %rsp
; popq %rbp
; ret
function %f4(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = sdiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; cqo %rax, %rdx
; idiv %rax, %rdx, %rsi, %rax, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -0,0 +1,51 @@
test compile precise-output
target x86_64
function %f1(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = smulhi v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; imul %ax, %si, %ax, %dx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f2(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = smulhi v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; imul %eax, %esi, %eax, %edx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = smulhi v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; imul %rax, %rsi, %rax, %rdx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -0,0 +1,71 @@
test compile precise-output
target x86_64
function %f1(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = srem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; srem_seq %al, %dl, %sil, %al, %dl, tmp=(none)
; shrq $8, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f2(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = srem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; srem_seq %ax, %dx, %si, %ax, %dx, tmp=(none)
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = srem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; srem_seq %eax, %edx, %esi, %eax, %edx, tmp=(none)
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f4(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = srem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; srem_seq %rax, %rdx, %rsi, %rax, %rdx, tmp=(none)
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -0,0 +1,67 @@
test compile precise-output
target x86_64
function %f1(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = udiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movzbl %al, %eax
; div %al, (none), %sil, %al, %dl
; movq %rbp, %rsp
; popq %rbp
; ret
function %f2(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = udiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; div %ax, %dx, %si, %ax, %dx
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = udiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; div %eax, %edx, %esi, %eax, %edx
; movq %rbp, %rsp
; popq %rbp
; ret
function %f4(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = udiv v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; div %rax, %rdx, %rsi, %rax, %rdx
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -0,0 +1,51 @@
test compile precise-output
target x86_64
function %f1(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = umulhi v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; mul %ax, %si, %ax, %dx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f2(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = umulhi v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; mul %eax, %esi, %eax, %edx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = umulhi v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; mul %rax, %rsi, %rax, %rdx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret

View File

@@ -0,0 +1,71 @@
test compile precise-output
target x86_64
function %f1(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = urem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movzbl %al, %eax
; div %al, (none), %sil, %al, %dl
; shrq $8, %rax, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f2(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = urem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; div %ax, %dx, %si, %ax, %dx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f3(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = urem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; div %eax, %edx, %esi, %eax, %edx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret
function %f4(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = urem v0, v1
return v2
}
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %rax
; movl $0, %edx
; div %rax, %rdx, %rsi, %rax, %rdx
; movq %rdx, %rax
; movq %rbp, %rsp
; popq %rbp
; ret