diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index a1d13a70d6..99402fc9d5 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1945,15 +1945,8 @@ fn define_simd( for recipe in &[rec_fld, rec_fldDisp8, rec_fldDisp32] { let inst = *inst; let template = recipe.opcodes(*opcodes); - e.enc32_maybe_isap(inst.clone().bind(I32), template.clone(), isap); - // REX-less encoding must come after REX encoding so we don't use it by - // default. Otherwise reg-alloc would never use r8 and up. - e.enc64_maybe_isap(inst.clone().bind(I32), template.clone().rex(), isap); - e.enc64_maybe_isap(inst.clone().bind(I32), template.clone(), isap); - // Similar to above; TODO some of this duplication can be cleaned up by infer_rex() - // tracked in https://github.com/bytecodealliance/cranelift/issues/1090 - e.enc64_maybe_isap(inst.clone().bind(I64), template.clone().rex(), isap); - e.enc64_maybe_isap(inst.bind(I64), template, isap); + e.enc_both_inferred_maybe_isap(inst.clone().bind(I32), template.clone(), isap); + e.enc64_maybe_isap(inst.bind(I64), template.infer_rex(), isap); } } diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index 85b9068715..eb83eb15ba 100644 --- a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -2087,7 +2087,7 @@ pub(crate) fn define<'shared>( ); // XX /r float load with 8-bit offset. - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("fldDisp8", &formats.load, 2) .operands_in(vec![gpr]) .operands_out(vec![fpr]) @@ -2110,6 +2110,7 @@ pub(crate) fn define<'shared>( sink.put1(offset as u8); "#, ), + "size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0", ); let has_big_offset = @@ -2142,7 +2143,7 @@ pub(crate) fn define<'shared>( ); // XX /r float load with 32-bit offset. - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("fldDisp32", &formats.load, 5) .operands_in(vec![gpr]) .operands_out(vec![fpr]) @@ -2165,6 +2166,7 @@ pub(crate) fn define<'shared>( sink.put4(offset as u32); "#, ), + "size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0", ); } diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index 93e06d2795..e0ec976fdd 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -156,6 +156,22 @@ fn size_plus_maybe_sib_or_offset_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0( + if needs_rex { 1 } else { 0 } } +/// Calculates the size while inferring if the first input register (inreg0) and first output +/// register (outreg0) require a dynamic REX and if the first input register (inreg0) requires a +/// SIB. +fn size_plus_maybe_sib_for_inreg_0_plus_rex_prefix_for_inreg0_outreg0( + sizing: &RecipeSizing, + enc: Encoding, + inst: Inst, + divert: &RegDiversions, + func: &Function, +) -> u8 { + let needs_rex = (EncodingBits::from(enc.bits()).rex_w() != 0) + || test_input(0, inst, divert, func, is_extended_reg) + || test_result(0, inst, divert, func, is_extended_reg); + size_plus_maybe_sib_for_inreg_0(sizing, enc, inst, divert, func) + if needs_rex { 1 } else { 0 } +} + /// Infers whether a dynamic REX prefix will be emitted, for use with one input reg. /// /// A REX prefix is known to be emitted if either: diff --git a/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif index b164aac343..a5d649125f 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif @@ -40,29 +40,29 @@ block0: function %uload_extend() { block0: [-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef - [-,%xmm2] v3 = uload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 30 12 - [-,%xmm2] v4 = uload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 30 52 14 - [-,%xmm2] v5 = uload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 30 92 00000100 - [-,%xmm2] v6 = uload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 33 12 - [-,%xmm2] v7 = uload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 33 52 14 - [-,%xmm2] v8 = uload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 33 92 00000100 - [-,%xmm2] v9 = uload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 35 12 - [-,%xmm2] v10 = uload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 35 52 14 - [-,%xmm2] v11 = uload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 35 92 00000100 + [-,%xmm2] v3 = uload8x8 v1+0 ; bin: heap_oob 66 0f 38 30 12 + [-,%xmm2] v4 = uload8x8 v1+20 ; bin: heap_oob 66 0f 38 30 52 14 + [-,%xmm2] v5 = uload8x8 v1+256 ; bin: heap_oob 66 0f 38 30 92 00000100 + [-,%xmm2] v6 = uload16x4 v1+0 ; bin: heap_oob 66 0f 38 33 12 + [-,%xmm2] v7 = uload16x4 v1+20 ; bin: heap_oob 66 0f 38 33 52 14 + [-,%xmm2] v8 = uload16x4 v1+256 ; bin: heap_oob 66 0f 38 33 92 00000100 + [-,%xmm10] v9 = uload32x2 v1+0 ; bin: heap_oob 66 44 0f 38 35 12 + [-,%xmm10] v10 = uload32x2 v1+20 ; bin: heap_oob 66 44 0f 38 35 52 14 + [-,%xmm10] v11 = uload32x2 v1+256 ; bin: heap_oob 66 44 0f 38 35 92 00000100 return } function %sload_extend() { block0: [-,%rdx] v1 = iconst.i64 0x0123_4567_89ab_cdef - [-,%xmm2] v3 = sload8x8 v1+0 ; bin: heap_oob 66 40 0f 38 20 12 - [-,%xmm2] v4 = sload8x8 v1+20 ; bin: heap_oob 66 40 0f 38 20 52 14 - [-,%xmm2] v5 = sload8x8 v1+256 ; bin: heap_oob 66 40 0f 38 20 92 00000100 - [-,%xmm2] v6 = sload16x4 v1+0 ; bin: heap_oob 66 40 0f 38 23 12 - [-,%xmm2] v7 = sload16x4 v1+20 ; bin: heap_oob 66 40 0f 38 23 52 14 - [-,%xmm2] v8 = sload16x4 v1+256 ; bin: heap_oob 66 40 0f 38 23 92 00000100 - [-,%xmm2] v9 = sload32x2 v1+0 ; bin: heap_oob 66 40 0f 38 25 12 - [-,%xmm2] v10 = sload32x2 v1+20 ; bin: heap_oob 66 40 0f 38 25 52 14 - [-,%xmm2] v11 = sload32x2 v1+256 ; bin: heap_oob 66 40 0f 38 25 92 00000100 + [-,%xmm2] v3 = sload8x8 v1+0 ; bin: heap_oob 66 0f 38 20 12 + [-,%xmm2] v4 = sload8x8 v1+20 ; bin: heap_oob 66 0f 38 20 52 14 + [-,%xmm2] v5 = sload8x8 v1+256 ; bin: heap_oob 66 0f 38 20 92 00000100 + [-,%xmm10] v6 = sload16x4 v1+0 ; bin: heap_oob 66 44 0f 38 23 12 + [-,%xmm10] v7 = sload16x4 v1+20 ; bin: heap_oob 66 44 0f 38 23 52 14 + [-,%xmm10] v8 = sload16x4 v1+256 ; bin: heap_oob 66 44 0f 38 23 92 00000100 + [-,%xmm2] v9 = sload32x2 v1+0 ; bin: heap_oob 66 0f 38 25 12 + [-,%xmm2] v10 = sload32x2 v1+20 ; bin: heap_oob 66 0f 38 25 52 14 + [-,%xmm2] v11 = sload32x2 v1+256 ; bin: heap_oob 66 0f 38 25 92 00000100 return }