diff --git a/cranelift/codegen/meta/src/cdsl/instructions.rs b/cranelift/codegen/meta/src/cdsl/instructions.rs
index b91d9f26cf..f061be94ed 100644
--- a/cranelift/codegen/meta/src/cdsl/instructions.rs
+++ b/cranelift/codegen/meta/src/cdsl/instructions.rs
@@ -1127,6 +1127,11 @@ fn bind_vector(
     mut value_types: Vec<ValueTypeOrAny>,
 ) -> BoundInstruction {
     let num_lanes = vector_size_in_bits / lane_type.lane_bits();
+    assert!(
+        num_lanes >= 2,
+        "Minimum lane number for bind_vector is 2, found {}.",
+        num_lanes,
+    );
     let vector_type = ValueType::Vector(VectorType::new(lane_type, num_lanes));
     value_types.push(ValueTypeOrAny::ValueType(vector_type));
     verify_polymorphic_binding(&inst, &value_types);
diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 71f1042100..1739ed3ab7 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -9,7 +9,7 @@ use crate::cdsl::instructions::{
 };
 use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
 use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
-use crate::cdsl::types::ValueType;
+use crate::cdsl::types::{LaneType, ValueType};
 use crate::shared::types::Bool::{B1, B16, B32, B64, B8};
 use crate::shared::types::Float::{F32, F64};
 use crate::shared::types::Int::{I16, I32, I64, I8};
@@ -1735,6 +1735,8 @@ pub(crate) fn define(
     // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
     // value across the register
 
+    let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;
+
     // PSHUFB, 8-bit shuffle using two XMM registers
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
         let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
@@ -1756,7 +1758,7 @@ pub(crate) fn define(
     // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
     // to the Intel manual: "When the destination operand is an XMM register, the source operand is
     // written to the low doubleword of the register and the register is zero-extended to 128 bits."
-    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
         let instruction = scalar_to_vector.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
         if ty.lane_bits() < 64 {
@@ -1816,8 +1818,9 @@ pub(crate) fn define(
     }
 
     // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
-    for from_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
-        for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type)
+    for from_type in ValueType::all_lane_types().filter(allowed_simd_type) {
+        for to_type in
+            ValueType::all_lane_types().filter(|t| allowed_simd_type(t) && *t != from_type)
         {
             let instruction = raw_bitcast
                 .bind_vector_from_lane(to_type, sse_vector_size)
@@ -1833,7 +1836,7 @@ pub(crate) fn define(
     // for that; alternately, constants could be loaded into XMM registers using a sequence like:
     // MOVQ + MOVHPD + MOVQ + MOVLPD (this allows the constants to be immediates instead of stored
     // in memory) but some performance measurements are needed.
-    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
+    for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
         let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_vconst.nonrex().opcodes(vec![0x0f, 0x10]);
         e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
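
For reference, here is a standalone sketch of the two guards this patch introduces: the `num_lanes >= 2` assertion in `bind_vector`, and the shared `allowed_simd_type` predicate that replaces the repeated inline `t.lane_bits() >= 8` filters. The `LaneType` struct below is a hypothetical stand-in for illustration, not cranelift's actual type:

```rust
// Hypothetical stand-in for cranelift's LaneType; only the lane width matters here.
struct LaneType {
    bits: u64,
}

impl LaneType {
    fn lane_bits(&self) -> u64 {
        self.bits
    }
}

// Mirrors the new assertion in bind_vector: a "vector" with fewer than two
// lanes is really a scalar, so fail loudly at meta-generation time.
fn num_lanes(vector_size_in_bits: u64, lane_type: &LaneType) -> u64 {
    let num_lanes = vector_size_in_bits / lane_type.lane_bits();
    assert!(
        num_lanes >= 2,
        "Minimum lane number for bind_vector is 2, found {}.",
        num_lanes,
    );
    num_lanes
}

fn main() {
    // The shared predicate: lanes of at least 8 bits but strictly narrower than
    // the 128-bit XMM register itself; a 128-bit lane would yield a one-lane
    // "vector", which the assertion above now rejects.
    let allowed_simd_type = |t: &LaneType| t.lane_bits() >= 8 && t.lane_bits() < 128;

    let lane_types = [
        LaneType { bits: 8 },
        LaneType { bits: 64 },
        LaneType { bits: 128 }, // filtered out; would otherwise trip the assert
    ];
    for t in lane_types.iter().filter(|&t| allowed_simd_type(t)) {
        println!("{} lanes of {} bits", num_lanes(128, t), t.lane_bits());
    }
}
```

Tightening the predicate to `< 128` and asserting in `bind_vector` catch the same mistake at two layers: the filter stops the encodings loops from ever requesting a one-lane vector, and the assertion turns any remaining caller bug into an immediate panic instead of a silently bound scalar type.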