Avoid unnecessary lane calculations in codegen code
This refactor moves the calculation of the number of lanes closer to where the Instruction/BoundInstruction is actually bound.
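As an illustration of the call-site change, here is a minimal, self-contained sketch; the types below are hypothetical stand-ins for the meta crate's Instruction/LaneType, and the 32-bit lane and 128-bit width are just examples matching the SSE-sized vectors used in this diff.

    #[derive(Clone, Copy)]
    struct LaneType {
        bits: u64,
    }

    struct Inst;

    impl Inst {
        // Old shape: every call site computed and passed the lane count itself.
        fn bind_vector(&self, _lane: LaneType, _num_lanes: u64) {}

        // New shape: call sites pass the vector width in bits and the lane count
        // is derived once, inside the binding helper.
        fn bind_vector_from_lane(&self, lane: LaneType, vector_size_in_bits: u64) {
            let _num_lanes = vector_size_in_bits / lane.bits;
        }
    }

    fn main() {
        let lane = LaneType { bits: 32 };
        let inst = Inst;
        inst.bind_vector(lane, 128 / lane.bits); // before: 128 / 32 = 4 lanes
        inst.bind_vector_from_lane(lane, 128); // after: 128 = SSE vector size in bits
    }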
@@ -181,8 +181,17 @@ impl Instruction {
         bind_ref(self.clone(), Some(reference_type.into()), Vec::new())
     }
 
-    pub fn bind_vector(&self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
-        bind_vector(self.clone(), lane_type.into(), num_lanes, Vec::new())
+    pub fn bind_vector_from_lane(
+        &self,
+        lane_type: impl Into<LaneType>,
+        vector_size_in_bits: u64,
+    ) -> BoundInstruction {
+        bind_vector(
+            self.clone(),
+            lane_type.into(),
+            vector_size_in_bits,
+            Vec::new(),
+        )
     }
 
     pub fn bind_any(&self) -> BoundInstruction {
@@ -414,8 +423,17 @@ impl BoundInstruction {
         bind_ref(self.inst, Some(reference_type.into()), self.value_types)
     }
 
-    pub fn bind_vector(self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
-        bind_vector(self.inst, lane_type.into(), num_lanes, self.value_types)
+    pub fn bind_vector_from_lane(
+        self,
+        lane_type: impl Into<LaneType>,
+        vector_size_in_bits: u64,
+    ) -> BoundInstruction {
+        bind_vector(
+            self.inst,
+            lane_type.into(),
+            vector_size_in_bits,
+            self.value_types,
+        )
     }
 
     pub fn bind_any(self) -> BoundInstruction {
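Both Instruction and BoundInstruction expose the new method so bindings can be chained when an instruction carries more than one polymorphic vector type; the excerpt below (identifiers as they appear in the raw_bitcast encoding later in this diff) is illustrative only, not standalone-compilable.

    let instruction = raw_bitcast
        .bind_vector_from_lane(to_type, sse_vector_size) // Instruction -> BoundInstruction
        .bind_vector_from_lane(from_type, sse_vector_size); // BoundInstruction -> BoundInstruction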
@@ -1116,9 +1134,10 @@ fn bind_ref(
 fn bind_vector(
     inst: Instruction,
     lane_type: LaneType,
-    num_lanes: u64,
+    vector_size_in_bits: u64,
     mut value_types: Vec<ValueTypeOrAny>,
 ) -> BoundInstruction {
+    let num_lanes = vector_size_in_bits / lane_type.lane_bits();
     let vector_type = ValueType::Vector(VectorType::new(lane_type, num_lanes));
     value_types.push(ValueTypeOrAny::ValueType(vector_type));
     verify_polymorphic_binding(&inst, &value_types);
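With the change above, the lane count is derived in one place from the vector width and the lane width; a small self-contained check of the arithmetic (the widths are examples, 128 matching the SSE-sized vectors used below):

    fn num_lanes(vector_size_in_bits: u64, lane_bits: u64) -> u64 {
        // mirrors the new line in bind_vector: vector_size_in_bits / lane_type.lane_bits()
        vector_size_in_bits / lane_bits
    }

    fn main() {
        assert_eq!(num_lanes(128, 32), 4); // 32-bit lanes in a 128-bit vector, e.g. I32X4
        assert_eq!(num_lanes(128, 8), 16); // 8-bit lanes, matching the old `128 / ty.lane_bits()` at call sites
    }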
@@ -1627,23 +1627,24 @@ pub fn define(
     e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
     e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
 
+    // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
+    let sse_vector_size: u64 = 128;
+
     // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
     // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
     // value across the register
 
     // PSHUFB, 8-bit shuffle using two XMM registers
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let number_of_lanes = 128 / ty.lane_bits();
-        let instruction = x86_pshufb.bind_vector(ty, number_of_lanes);
-        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
+        let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
+        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 00]);
         e.enc32_isap(instruction.clone(), template.clone(), use_ssse3);
         e.enc64_isap(instruction, template, use_ssse3);
     }
 
     // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
-        let number_of_lanes = 128 / ty.lane_bits();
-        let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
+        let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size);
         let template = rec_r_ib_unsigned_fpr
             .nonrex()
             .opcodes(vec![0x66, 0x0f, 0x70]);
@@ -1655,8 +1656,9 @@ pub fn define(
     // to the Intel manual: "When the destination operand is an XMM register, the source operand is
     // written to the low doubleword of the register and the regiser is zero-extended to 128 bits."
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
-        let number_of_lanes = 128 / ty.lane_bits();
-        let instruction = scalar_to_vector.bind_vector(ty, number_of_lanes).bind(ty);
+        let instruction = scalar_to_vector
+            .bind_vector_from_lane(ty, sse_vector_size)
+            .bind(ty);
         let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
         if ty.lane_bits() < 64 {
             // no 32-bit encodings for 64-bit widths
@@ -1674,8 +1676,7 @@ pub fn define(
 
     for ty in ValueType::all_lane_types() {
         if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
-            let number_of_lanes = 128 / ty.lane_bits();
-            let instruction = insertlane.bind_vector(ty, number_of_lanes);
+            let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
@@ -1695,8 +1696,7 @@ pub fn define(
 
     for ty in ValueType::all_lane_types() {
         if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
-            let number_of_lanes = 128 / ty.lane_bits();
-            let instruction = extractlane.bind_vector(ty, number_of_lanes);
+            let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
             let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
             if ty.lane_bits() < 64 {
                 e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
@@ -1709,7 +1709,7 @@ pub fn define(
 
     // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let instruction = bitcast.bind_vector(ty, 16).bind(F64);
+        let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
         e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
         e.enc64_rec(instruction, rec_null_fpr, 0);
     }
@@ -1719,8 +1719,8 @@ pub fn define(
         for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type)
         {
             let instruction = raw_bitcast
-                .bind_vector(to_type, 128 / to_type.lane_bits())
-                .bind_vector(from_type, 128 / from_type.lane_bits());
+                .bind_vector_from_lane(to_type, sse_vector_size)
+                .bind_vector_from_lane(from_type, sse_vector_size);
             e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
             e.enc64_rec(instruction, rec_null_fpr, 0);
         }
@@ -320,12 +320,15 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
     let c = var("c");
     let d = var("d");
 
+    // SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
+    let sse_vector_size: u64 = 128;
+
     // SIMD splat: 8-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
-        let splat_x8x16 = splat.bind_vector(ty, 128 / ty.lane_bits());
-        let bitcast_f64_to_any8x16 = bitcast.bind_vector(ty, 128 / ty.lane_bits()).bind(F64);
+        let splat_any8x16 = splat.bind_vector_from_lane(ty, sse_vector_size);
+        let bitcast_f64_to_any8x16 = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
         narrow.legalize(
-            def!(y = splat_x8x16(x)),
+            def!(y = splat_any8x16(x)),
             vec![
                 def!(a = scalar_to_vector(x)), // move into the lowest 8 bits of an XMM register
                 def!(b = f64const(ieee64_zero)), // zero out a different XMM register; the shuffle mask for moving the lowest byte to all other byte lanes is 0x0
@@ -337,13 +340,13 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
 
     // SIMD splat: 16-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
-        let splat_x16x8 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let splat_x16x8 = splat.bind_vector_from_lane(ty, sse_vector_size);
         let raw_bitcast_any16x8_to_i32x4 = raw_bitcast
-            .bind_vector(I32, 4)
-            .bind_vector(ty, 128 / ty.lane_bits());
+            .bind_vector_from_lane(I32, sse_vector_size)
+            .bind_vector_from_lane(ty, sse_vector_size);
         let raw_bitcast_i32x4_to_any16x8 = raw_bitcast
-            .bind_vector(ty, 128 / ty.lane_bits())
-            .bind_vector(I32, 4);
+            .bind_vector_from_lane(ty, sse_vector_size)
+            .bind_vector_from_lane(I32, sse_vector_size);
         narrow.legalize(
             def!(y = splat_x16x8(x)),
             vec![
@@ -358,7 +361,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
 
     // SIMD splat: 32-bits
    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
-        let splat_any32x4 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let splat_any32x4 = splat.bind_vector_from_lane(ty, sse_vector_size);
         narrow.legalize(
             def!(y = splat_any32x4(x)),
             vec![
@@ -370,7 +373,7 @@ pub fn define(shared: &mut SharedDefinitions, x86_instructions: &InstructionGrou
 
     // SIMD splat: 64-bits
     for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 64) {
-        let splat_any64x2 = splat.bind_vector(ty, 128 / ty.lane_bits());
+        let splat_any64x2 = splat.bind_vector_from_lane(ty, sse_vector_size);
         narrow.legalize(
             def!(y = splat_any64x2(x)),
             vec![