diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 536a4c7608..dad9d9bba5 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1646,6 +1646,7 @@ fn define_simd(
     let x86_pmins = x86.by_name("x86_pmins");
     let x86_pminu = x86.by_name("x86_pminu");
     let x86_pmullq = x86.by_name("x86_pmullq");
+    let x86_pmuludq = x86.by_name("x86_pmuludq");
     let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_pshufd = x86.by_name("x86_pshufd");
     let x86_psll = x86.by_name("x86_psll");
@@ -2100,6 +2101,9 @@ fn define_simd(
         e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap);
     }
 
+    // SIMD multiplication of the low 32 bits of each 64-bit lane, producing 64-bit results.
+    e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ));
+
     // SIMD integer multiplication for I64x2 using a AVX512.
     {
         e.enc_32_64_maybe_isap(
diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs
index 3e258713a0..53d91ca861 100644
--- a/cranelift/codegen/meta/src/isa/x86/instructions.rs
+++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs
@@ -475,10 +475,11 @@ pub(crate) fn define(
             .includes_scalars(false)
             .build(),
     );
-    let I64x2 = &TypeVar::new(
-        "I64x2",
-        "A SIMD vector type containing one large integer (the upper lane is concatenated with \
-         the lower lane to form the integer)",
+    let I128 = &TypeVar::new(
+        "I128",
+        "A SIMD vector type containing one large integer (due to Cranelift type constraints, \
+        this uses the Cranelift I64X2 type but should be understood as one large value, i.e., the \
+        upper lane is concatenated with the lower lane to form the integer)",
         TypeSetBuilder::new()
             .ints(64..64)
             .simd_lanes(2..2)
@@ -487,7 +488,7 @@ pub(crate) fn define(
     );
 
     let x = &Operand::new("x", IxN).with_doc("Vector value to shift");
-    let y = &Operand::new("y", I64x2).with_doc("Number of bits to shift");
+    let y = &Operand::new("y", I128).with_doc("Number of bits to shift");
     let a = &Operand::new("a", IxN);
 
     ig.push(
@@ -532,6 +533,16 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let I64x2 = &TypeVar::new(
+        "I64x2",
+        "A SIMD vector type containing two 64-bit integers",
+        TypeSetBuilder::new()
+            .ints(64..64)
+            .simd_lanes(2..2)
+            .includes_scalars(false)
+            .build(),
+    );
+
     let x = &Operand::new("x", I64x2);
     let y = &Operand::new("y", I64x2);
     let a = &Operand::new("a", I64x2);
@@ -549,6 +560,20 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "x86_pmuludq",
+            r#"
+        Multiply Packed Integers -- Using only the bottom 32 bits in each lane, multiply two 64x2
+        unsigned integers and receive a 64x2 result. This instruction avoids the need for handling
+        overflow as in `x86_pmullq`.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
     let x = &Operand::new("x", TxN);
     let y = &Operand::new("y", TxN);
     let f = &Operand::new("f", iflags);
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index 74dff216e7..8a65553bcf 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -473,6 +473,10 @@ pub static PMULLD: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
 /// bits of each product in xmm1 (AVX512VL/DQ). Requires an EVEX encoding.
 pub static PMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40];
 
+/// Multiply the low unsigned doubleword of each quadword lane in xmm1 by the corresponding low
+/// doubleword in xmm2/m128, and store the full 64-bit quadword products in xmm1 (SSE2).
+pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4];
+
 /// Pop top of stack into r{16,32,64}; increment stack pointer.
 pub static POP_REG: [u8; 1] = [0x58];
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 443137f43f..daeb0c33c3 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1912,6 +1912,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::X86Pmins
         | Opcode::X86Pminu
         | Opcode::X86Pmullq
+        | Opcode::X86Pmuludq
         | Opcode::X86Packss
         | Opcode::X86Punpckh
         | Opcode::X86Punpckl
diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif
index e5b5e4b28a..f42ba11a96 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif
@@ -99,3 +99,9 @@ block0(v0: f64x2 [%xmm11], v1: f64x2 [%xmm13]):
 [-, %xmm11]    v8 = sqrt v0          ; bin: 66 45 0f 51 db
     return
 }
+
+function %pmuludq(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2 [%xmm3], v1: i64x2 [%xmm5]):
+[-, %xmm3]    v2 = x86_pmuludq v0, v1      ; bin: 66 0f f4 dd
+    return v2
+}