Avoid unnecessary lane calculations in codegen code

This refactor moves the calculation of the number of lanes to code closer to where the Instruction/BoundInstruction is bound.
This commit is contained in:
Andrew Brown
2019-07-11 15:59:39 -07:00
committed by Dan Gohman
parent 3fdc78174f
commit d492cf7e0e
3 changed files with 51 additions and 29 deletions

View File

@@ -1627,23 +1627,24 @@ pub fn define(
e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
// SIMD vector size: eventually multiple vector sizes may be supported but for now only SSE-sized vectors are available
let sse_vector_size: u64 = 128;
// SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
// legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
// value across the register
// PSHUFB, 8-bit shuffle using two XMM registers
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = x86_pshufb.bind_vector(ty, number_of_lanes);
let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
let instruction = x86_pshufb.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
e.enc32_isap(instruction.clone(), template.clone(), use_ssse3);
e.enc64_isap(instruction, template, use_ssse3);
}
// PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
let instruction = x86_pshufd.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_fpr
.nonrex()
.opcodes(vec![0x66, 0x0f, 0x70]);
@@ -1655,8 +1656,9 @@ pub fn define(
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = scalar_to_vector.bind_vector(ty, number_of_lanes).bind(ty);
let instruction = scalar_to_vector
.bind_vector_from_lane(ty, sse_vector_size)
.bind(ty);
let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
if ty.lane_bits() < 64 {
// no 32-bit encodings for 64-bit widths
@@ -1674,8 +1676,7 @@ pub fn define(
for ty in ValueType::all_lane_types() {
if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = insertlane.bind_vector(ty, number_of_lanes);
let instruction = insertlane.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
if ty.lane_bits() < 64 {
e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
@@ -1695,8 +1696,7 @@ pub fn define(
for ty in ValueType::all_lane_types() {
if let Some((opcode, isap)) = extractlane_mapping.get(&ty.lane_bits()) {
let number_of_lanes = 128 / ty.lane_bits();
let instruction = extractlane.bind_vector(ty, number_of_lanes);
let instruction = extractlane.bind_vector_from_lane(ty, sse_vector_size);
let template = rec_r_ib_unsigned_gpr.opcodes(opcode.clone());
if ty.lane_bits() < 64 {
e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
@@ -1709,7 +1709,7 @@ pub fn define(
// SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
let instruction = bitcast.bind_vector(ty, 16).bind(F64);
let instruction = bitcast.bind_vector_from_lane(ty, sse_vector_size).bind(F64);
e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
e.enc64_rec(instruction, rec_null_fpr, 0);
}
@@ -1719,8 +1719,8 @@ pub fn define(
for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type)
{
let instruction = raw_bitcast
.bind_vector(to_type, 128 / to_type.lane_bits())
.bind_vector(from_type, 128 / from_type.lane_bits());
.bind_vector_from_lane(to_type, sse_vector_size)
.bind_vector_from_lane(from_type, sse_vector_size);
e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
e.enc64_rec(instruction, rec_null_fpr, 0);
}