diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index fd160e3e21..7e9f74475e 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1795,14 +1795,14 @@ fn define_simd( let is_zero_128bit = InstructionPredicate::new_is_all_zeroes(&*formats.unary_const, "constant_handle"); - let template = rec_vconst_optimized.nonrex().opcodes(&PXOR); + let template = rec_vconst_optimized.opcodes(&PXOR).infer_rex(); e.enc_32_64_func(instruction.clone(), template, |builder| { builder.inst_predicate(is_zero_128bit) }); let is_ones_128bit = InstructionPredicate::new_is_all_ones(&*formats.unary_const, "constant_handle"); - let template = rec_vconst_optimized.nonrex().opcodes(&PCMPEQB); + let template = rec_vconst_optimized.opcodes(&PCMPEQB).infer_rex(); e.enc_32_64_func(instruction, template, |builder| { builder.inst_predicate(is_ones_128bit) }); @@ -1816,7 +1816,7 @@ fn define_simd( // in memory) but some performance measurements are needed. 
for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let instruction = vconst.bind(vector(ty, sse_vector_size)); - let template = rec_vconst.nonrex().opcodes(&MOVUPS_LOAD); + let template = rec_vconst.opcodes(&MOVUPS_LOAD).infer_rex(); e.enc_32_64_maybe_isap(instruction, template, None); // from SSE } @@ -1826,7 +1826,10 @@ fn define_simd( for ty in ValueType::all_lane_types().filter(allowed_simd_type) { // Store let bound_store = store.bind(vector(ty, sse_vector_size)).bind(Any); - e.enc_32_64(bound_store.clone(), rec_fst.opcodes(&MOVUPS_STORE)); + e.enc_32_64( + bound_store.clone(), + rec_fst.opcodes(&MOVUPS_STORE).infer_rex(), + ); e.enc_32_64(bound_store.clone(), rec_fstDisp8.opcodes(&MOVUPS_STORE)); e.enc_32_64(bound_store, rec_fstDisp32.opcodes(&MOVUPS_STORE)); diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index c512e98467..c9794a02da 100644 --- a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -46,6 +46,16 @@ impl<'builder> RecipeGroup<'builder> { self.templates.push(template.clone()); template } + fn add_template_inferred( + &mut self, + recipe: EncodingRecipeBuilder, + infer_function: &'static str, + ) -> Rc<Template<'builder>> { + let template = + Rc::new(Template::new(recipe, self.regs).inferred_rex_compute_size(infer_function)); + self.templates.push(template.clone()); + template + } fn add_template(&mut self, template: Template<'builder>) -> Rc<Template<'builder>> { let template = Rc::new(template); self.templates.push(template.clone()); @@ -1481,7 +1491,7 @@ pub(crate) fn define<'shared>( ); // XX /r register-indirect store of FPR with no offset. 
- recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("fst", &formats.store, 1) .operands_in(vec![fpr, gpr]) .inst_predicate(has_no_offset) @@ -1504,6 +1514,7 @@ pub(crate) fn define<'shared>( } "#, ), + "size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1", ); let has_small_offset = @@ -2515,7 +2526,7 @@ pub(crate) fn define<'shared>( ), ); - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("vconst", &formats.unary_const, 5) .operands_out(vec![fpr]) .clobbers_flags(false) @@ -2526,9 +2537,10 @@ pub(crate) fn define<'shared>( const_disp4(constant_handle, func, sink); "#, ), + "size_with_inferred_rex_for_outreg0", ); - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("vconst_optimized", &formats.unary_const, 1) .operands_out(vec![fpr]) .clobbers_flags(false) @@ -2538,6 +2550,7 @@ pub(crate) fn define<'shared>( modrm_rr(out_reg0, out_reg0, sink); "#, ), + "size_with_inferred_rex_for_outreg0", ); recipes.add_template_recipe( diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index a93be1d658..27f7ed43db 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -123,6 +123,22 @@ fn size_plus_maybe_sib_or_offset_for_inreg_1( sizing.base_size + if needs_sib_or_offset { 1 } else { 0 } } +/// Calculates the size while inferring if the first and second input registers (inreg0, inreg1) +/// require a dynamic REX prefix and if the second input register (inreg1) requires a SIB or offset. 
+fn size_plus_maybe_sib_or_offset_inreg1_plus_rex_prefix_for_inreg0_inreg1( + sizing: &RecipeSizing, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_rex = (EncodingBits::from(enc.bits()).rex_w() != 0) + || test_input(0, inst, divert, func, is_extended_reg) + || test_input(1, inst, divert, func, is_extended_reg); + size_plus_maybe_sib_or_offset_for_inreg_1(sizing, enc, inst, divert, func) + + if needs_rex { 1 } else { 0 } +} + /// Infers whether a dynamic REX prefix will be emitted, for use with one input reg. /// /// A REX prefix is known to be emitted if either: @@ -199,6 +215,19 @@ fn size_with_inferred_rex_for_inreg0_outreg0( sizing.base_size + if needs_rex { 1 } else { 0 } } +/// Infers whether a dynamic REX prefix will be emitted, based on a single output register. +fn size_with_inferred_rex_for_outreg0( + sizing: &RecipeSizing, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_rex = (EncodingBits::from(enc.bits()).rex_w() != 0) + || test_result(0, inst, divert, func, is_extended_reg); + sizing.base_size + if needs_rex { 1 } else { 0 } +} + /// Infers whether a dynamic REX prefix will be emitted, for use with CMOV. /// /// CMOV uses 3 inputs, with the REX is inferred from reg1 and reg2. diff --git a/cranelift/filetests/filetests/isa/x86/binary64.clif b/cranelift/filetests/filetests/isa/x86/binary64.clif index 41290d1462..c241d33769 100644 --- a/cranelift/filetests/filetests/isa/x86/binary64.clif +++ b/cranelift/filetests/filetests/isa/x86/binary64.clif @@ -1679,3 +1679,11 @@ block0: [-, %r10] v0 = bconst.b64 true ; bin: 41 ba 00000001 return } + +function %V128() { +block0: + [-,%r10] v3 = iconst.i64 0x2102_0304_f1f2_f3f4 ; bin: 49 ba 21020304f1f2f3f4 + [-, %xmm9] v4 = vconst.i32x4 [0 1 2 3] ; bin: 44 0f 10 0d 00000005 PCRelRodata4(23) + store v4, v3 ; bin: heap_oob 45 0f 11 0a + return +}