diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 536a4c7608..dad9d9bba5 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1646,6 +1646,7 @@ fn define_simd( let x86_pmins = x86.by_name("x86_pmins"); let x86_pminu = x86.by_name("x86_pminu"); let x86_pmullq = x86.by_name("x86_pmullq"); + let x86_pmuludq = x86.by_name("x86_pmuludq"); let x86_pshufb = x86.by_name("x86_pshufb"); let x86_pshufd = x86.by_name("x86_pshufd"); let x86_psll = x86.by_name("x86_psll"); @@ -2100,6 +2101,9 @@ fn define_simd( e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap); } + // SIMD multiplication with lane expansion. + e.enc_both_inferred(x86_pmuludq, rec_fa.opcodes(&PMULUDQ)); + // SIMD integer multiplication for I64x2 using a AVX512. { e.enc_32_64_maybe_isap( diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 3e258713a0..53d91ca861 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -475,10 +475,11 @@ pub(crate) fn define( .includes_scalars(false) .build(), ); - let I64x2 = &TypeVar::new( - "I64x2", - "A SIMD vector type containing one large integer (the upper lane is concatenated with \ - the lower lane to form the integer)", + let I128 = &TypeVar::new( + "I128", + "A SIMD vector type containing one large integer (due to Cranelift type constraints, \ + this uses the Cranelift I64X2 type but should be understood as one large value, i.e., the \ + upper lane is concatenated with the lower lane to form the integer)", TypeSetBuilder::new() .ints(64..64) .simd_lanes(2..2) @@ -487,7 +488,7 @@ pub(crate) fn define( ); let x = &Operand::new("x", IxN).with_doc("Vector value to shift"); - let y = &Operand::new("y", I64x2).with_doc("Number of bits to shift"); + let y = &Operand::new("y", 
I128).with_doc("Number of bits to shift"); let a = &Operand::new("a", IxN); ig.push( @@ -532,6 +533,16 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let I64x2 = &TypeVar::new( + "I64x2", + "A SIMD vector type containing two 64-bit integers", + TypeSetBuilder::new() + .ints(64..64) + .simd_lanes(2..2) + .includes_scalars(false) + .build(), + ); + let x = &Operand::new("x", I64x2); let y = &Operand::new("y", I64x2); let a = &Operand::new("a", I64x2); @@ -549,6 +560,20 @@ pub(crate) fn define( .operands_out(vec![a]), ); + ig.push( + Inst::new( + "x86_pmuludq", + r#" + Multiply Packed Integers -- Using only the bottom 32 bits in each lane, multiply two 64x2 + unsigned integers and receive a 64x2 result. This instruction avoids the need for handling + overflow as in `x86_pmullq`. + "#, + &formats.binary, + ) + .operands_in(vec![x, y]) + .operands_out(vec![a]), + ); + let x = &Operand::new("x", TxN); let y = &Operand::new("y", TxN); let f = &Operand::new("f", iflags); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index 74dff216e7..8a65553bcf 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -473,6 +473,10 @@ pub static PMULLD: [u8; 4] = [0x66, 0x0f, 0x38, 0x40]; /// bits of each product in xmm1 (AVX512VL/DQ). Requires an EVEX encoding. pub static PMULLQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x40]; +/// Multiply packed unsigned doubleword integers in xmm1 by packed unsigned doubleword integers +/// in xmm2/m128, and store the quadword results in xmm1 (SSE2). +pub static PMULUDQ: [u8; 3] = [0x66, 0x0f, 0xf4]; + /// Pop top of stack into r{16,32,64}; increment stack pointer. 
pub static POP_REG: [u8; 1] = [0x58]; diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 443137f43f..daeb0c33c3 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1912,6 +1912,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>( | Opcode::X86Pmins | Opcode::X86Pminu | Opcode::X86Pmullq + | Opcode::X86Pmuludq | Opcode::X86Packss | Opcode::X86Punpckh | Opcode::X86Punpckl diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif index e5b5e4b28a..f42ba11a96 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif @@ -99,3 +99,9 @@ block0(v0: f64x2 [%xmm11], v1: f64x2 [%xmm13]): [-, %xmm11] v8 = sqrt v0 ; bin: 66 45 0f 51 db return } + +function %pmuludq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2 [%xmm3], v1: i64x2 [%xmm5]): +[-, %xmm3] v2 = x86_pmuludq v0, v1 ; bin: 66 0f f4 dd + return v2 +}