From 0d63bd12d8bbc433b1a366bfa9e62a46622313ac Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Mon, 16 Mar 2020 18:22:21 -0700 Subject: [PATCH] Infer REX prefix for SIMD operations; fixes #1127 - Convert recipes to have necessary size calculator - Add a missing binemit function, `put_dynrexmp3` - Modify the meta-encodings of x86 SIMD instructions to use `infer_rex()`, mostly through the `enc_both_inferred()` helper - Fix up tests that previously always emitted a REX prefix --- .../codegen/meta/src/isa/x86/encodings.rs | 167 +++++++++--------- cranelift/codegen/meta/src/isa/x86/recipes.rs | 125 +++++++------ cranelift/codegen/src/isa/x86/binemit.rs | 21 ++- .../isa/x86/simd-arithmetic-binemit.clif | 52 ++++-- .../isa/x86/simd-comparison-binemit.clif | 58 ++++-- .../isa/x86/simd-lane-access-binemit.clif | 8 +- .../isa/x86/simd-memory-binemit.clif | 11 +- 7 files changed, 257 insertions(+), 185 deletions(-) diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 7397dc5e35..8de772c175 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -312,6 +312,23 @@ impl PerCpuModeEncodings { self.enc_x86_64_instp(inst, template, instp); } + /// Add two encodings for `inst`: + /// - X86_32, dynamically infer the REX prefix. + /// - X86_64, dynamically infer the REX prefix. + fn enc_both_inferred(&mut self, inst: impl Clone + Into, template: Template) { + self.enc32(inst.clone(), template.infer_rex()); + self.enc64(inst, template.infer_rex()); + } + fn enc_both_inferred_maybe_isap( + &mut self, + inst: impl Clone + Into, + template: Template, + isap: Option, + ) { + self.enc32_maybe_isap(inst.clone(), template.infer_rex(), isap); + self.enc64_maybe_isap(inst, template.infer_rex(), isap); + } + /// Add two encodings for `inst`: /// - X86_32 /// - X86_64 with the REX prefix. @@ -340,12 +357,6 @@ impl PerCpuModeEncodings { } } - /// Add the same encoding/template pairing to both X86_32 and X86_64 - fn enc_32_64(&mut self, inst: impl Clone + Into, template: Template) { - self.enc32(inst.clone(), template.clone()); - self.enc64(inst, template); - } - /// Add the same encoding/recipe pairing to both X86_32 and X86_64 fn enc_32_64_rec( &mut self, @@ -1674,17 +1685,15 @@ fn define_simd( // PSHUFB, 8-bit shuffle using two XMM registers. for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let instruction = x86_pshufb.bind(vector(ty, sse_vector_size)); - let template = rec_fa.nonrex().opcodes(&PSHUFB); - e.enc32_isap(instruction.clone(), template.clone(), use_ssse3_simd); - e.enc64_isap(instruction, template, use_ssse3_simd); + let template = rec_fa.opcodes(&PSHUFB); + e.enc_both_inferred_maybe_isap(instruction.clone(), template.clone(), Some(use_ssse3_simd)); } // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate. 
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) { let instruction = x86_pshufd.bind(vector(ty, sse_vector_size)); - let template = rec_r_ib_unsigned_fpr.nonrex().opcodes(&PSHUFD); - e.enc32(instruction.clone(), template.clone()); - e.enc64(instruction, template); + let template = rec_r_ib_unsigned_fpr.opcodes(&PSHUFD); + e.enc_both_inferred(instruction, template); } // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according @@ -1693,12 +1702,12 @@ fn define_simd( for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let instruction = scalar_to_vector.bind(vector(ty, sse_vector_size)); if ty.is_float() { + // No need to move floats--they already live in XMM registers. e.enc_32_64_rec(instruction, rec_null_fpr, 0); } else { let template = rec_frurm.opcodes(&MOVD_LOAD_XMM); if ty.lane_bits() < 64 { - e.enc32(instruction.clone(), template.clone()); - e.enc_x86_64(instruction, template); + e.enc_both_inferred(instruction, template); } else { // No 32-bit encodings for 64-bit widths. assert_eq!(ty.lane_bits(), 64); @@ -1719,7 +1728,7 @@ fn define_simd( let instruction = x86_pinsr.bind(vector(ty, sse_vector_size)); let template = rec_r_ib_unsigned_r.opcodes(opcode); if ty.lane_bits() < 64 { - e.enc_32_64_maybe_isap(instruction, template.nonrex(), isap); + e.enc_both_inferred_maybe_isap(instruction, template, isap); } else { // It turns out the 64-bit widths have REX/W encodings and only are available on // x86_64. @@ -1730,22 +1739,22 @@ fn define_simd( // For legalizing insertlane with floats, INSERTPS from SSE4.1. { let instruction = x86_insertps.bind(vector(F32, sse_vector_size)); - let template = rec_fa_ib.nonrex().opcodes(&INSERTPS); - e.enc_32_64_maybe_isap(instruction, template, Some(use_sse41_simd)); + let template = rec_fa_ib.opcodes(&INSERTPS); + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); } // For legalizing insertlane with floats, MOVSD from SSE2. { let instruction = x86_movsd.bind(vector(F64, sse_vector_size)); - let template = rec_fa.nonrex().opcodes(&MOVSD_LOAD); - e.enc_32_64_maybe_isap(instruction, template, None); // from SSE2 + let template = rec_fa.opcodes(&MOVSD_LOAD); + e.enc_both_inferred(instruction, template); // from SSE2 } // For legalizing insertlane with floats, MOVLHPS from SSE. { let instruction = x86_movlhps.bind(vector(F64, sse_vector_size)); - let template = rec_fa.nonrex().opcodes(&MOVLHPS); - e.enc_32_64_maybe_isap(instruction, template, None); // from SSE + let template = rec_fa.opcodes(&MOVLHPS); + e.enc_both_inferred(instruction, template); // from SSE } // SIMD extractlane @@ -1760,7 +1769,7 @@ fn define_simd( let instruction = x86_pextr.bind(vector(ty, sse_vector_size)); let template = rec_r_ib_unsigned_gpr.opcodes(opcode); if ty.lane_bits() < 64 { - e.enc_32_64_maybe_isap(instruction, template.nonrex(), Some(use_sse41_simd)); + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); } else { // It turns out the 64-bit widths have REX/W encodings and only are available on // x86_64. @@ -1838,85 +1847,81 @@ fn define_simd( // in memory) but some performance measurements are needed. 
for ty in ValueType::all_lane_types().filter(allowed_simd_type) { let instruction = vconst.bind(vector(ty, sse_vector_size)); - let template = rec_vconst.opcodes(&MOVUPS_LOAD).infer_rex(); - e.enc_32_64_maybe_isap(instruction, template, None); // from SSE + let template = rec_vconst.opcodes(&MOVUPS_LOAD); + e.enc_both_inferred(instruction, template); // from SSE } - // SIMD register movement: store, load, spill, fill, regmove. All of these use encodings of + // SIMD register movement: store, load, spill, fill, regmove, etc. All of these use encodings of // MOVUPS and MOVAPS from SSE (TODO ideally all of these would either use MOVAPS when we have // alignment or type-specific encodings, see https://github.com/bytecodealliance/wasmtime/issues/1124). + // Also, it would be ideal to infer REX prefixes for all of these instructions but for the + // time being only instructions with common recipes have `infer_rex()` support. for ty in ValueType::all_lane_types().filter(allowed_simd_type) { // Store let bound_store = store.bind(vector(ty, sse_vector_size)).bind(Any); - e.enc_32_64( - bound_store.clone(), - rec_fst.opcodes(&MOVUPS_STORE).infer_rex(), - ); - e.enc_32_64(bound_store.clone(), rec_fstDisp8.opcodes(&MOVUPS_STORE)); - e.enc_32_64(bound_store, rec_fstDisp32.opcodes(&MOVUPS_STORE)); + e.enc_both_inferred(bound_store.clone(), rec_fst.opcodes(&MOVUPS_STORE)); + e.enc_both(bound_store.clone(), rec_fstDisp8.opcodes(&MOVUPS_STORE)); + e.enc_both(bound_store, rec_fstDisp32.opcodes(&MOVUPS_STORE)); // Store complex let bound_store_complex = store_complex.bind(vector(ty, sse_vector_size)); - e.enc_32_64( + e.enc_both( bound_store_complex.clone(), rec_fstWithIndex.opcodes(&MOVUPS_STORE), ); - e.enc_32_64( + e.enc_both( bound_store_complex.clone(), rec_fstWithIndexDisp8.opcodes(&MOVUPS_STORE), ); - e.enc_32_64( + e.enc_both( bound_store_complex, rec_fstWithIndexDisp32.opcodes(&MOVUPS_STORE), ); // Load let bound_load = load.bind(vector(ty, sse_vector_size)).bind(Any); - e.enc_32_64( - bound_load.clone(), - rec_fld.opcodes(&MOVUPS_LOAD).infer_rex(), - ); - e.enc_32_64(bound_load.clone(), rec_fldDisp8.opcodes(&MOVUPS_LOAD)); - e.enc_32_64(bound_load, rec_fldDisp32.opcodes(&MOVUPS_LOAD)); + e.enc_both_inferred(bound_load.clone(), rec_fld.opcodes(&MOVUPS_LOAD)); + e.enc_both(bound_load.clone(), rec_fldDisp8.opcodes(&MOVUPS_LOAD)); + e.enc_both(bound_load, rec_fldDisp32.opcodes(&MOVUPS_LOAD)); // Load complex let bound_load_complex = load_complex.bind(vector(ty, sse_vector_size)); - e.enc_32_64( + e.enc_both( bound_load_complex.clone(), rec_fldWithIndex.opcodes(&MOVUPS_LOAD), ); - e.enc_32_64( + e.enc_both( bound_load_complex.clone(), rec_fldWithIndexDisp8.opcodes(&MOVUPS_LOAD), ); - e.enc_32_64( + e.enc_both( bound_load_complex, rec_fldWithIndexDisp32.opcodes(&MOVUPS_LOAD), ); // Spill let bound_spill = spill.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bound_spill, rec_fspillSib32.opcodes(&MOVUPS_STORE)); + e.enc_both(bound_spill, rec_fspillSib32.opcodes(&MOVUPS_STORE)); let bound_regspill = regspill.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bound_regspill, rec_fregspill32.opcodes(&MOVUPS_STORE)); + e.enc_both(bound_regspill, rec_fregspill32.opcodes(&MOVUPS_STORE)); // Fill let bound_fill = fill.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bound_fill, rec_ffillSib32.opcodes(&MOVUPS_LOAD)); + e.enc_both(bound_fill, rec_ffillSib32.opcodes(&MOVUPS_LOAD)); let bound_regfill = regfill.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bound_regfill, rec_fregfill32.opcodes(&MOVUPS_LOAD)); + 
e.enc_both(bound_regfill, rec_fregfill32.opcodes(&MOVUPS_LOAD)); let bound_fill_nop = fill_nop.bind(vector(ty, sse_vector_size)); e.enc_32_64_rec(bound_fill_nop, rec_ffillnull, 0); // Regmove let bound_regmove = regmove.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bound_regmove, rec_frmov.opcodes(&MOVAPS_LOAD)); + e.enc_both(bound_regmove, rec_frmov.opcodes(&MOVAPS_LOAD)); // Copy let bound_copy = copy.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bound_copy, rec_furm.opcodes(&MOVAPS_LOAD)); + e.enc_both(bound_copy, rec_furm.opcodes(&MOVAPS_LOAD)); let bound_copy_to_ssa = copy_to_ssa.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bound_copy_to_ssa, rec_furm_reg_to_ssa.opcodes(&MOVAPS_LOAD)); + e.enc_both(bound_copy_to_ssa, rec_furm_reg_to_ssa.opcodes(&MOVAPS_LOAD)); let bound_copy_nop = copy_nop.bind(vector(ty, sse_vector_size)); e.enc_32_64_rec(bound_copy_nop, rec_stacknull, 0); } @@ -1924,23 +1929,23 @@ fn define_simd( // SIMD integer addition for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] { let iadd = iadd.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(iadd, rec_fa.opcodes(*opcodes)); + e.enc_both_inferred(iadd, rec_fa.opcodes(*opcodes)); } // SIMD integer saturating addition - e.enc_32_64( + e.enc_both_inferred( sadd_sat.bind(vector(I8, sse_vector_size)), rec_fa.opcodes(&PADDSB), ); - e.enc_32_64( + e.enc_both_inferred( sadd_sat.bind(vector(I16, sse_vector_size)), rec_fa.opcodes(&PADDSW), ); - e.enc_32_64( + e.enc_both_inferred( uadd_sat.bind(vector(I8, sse_vector_size)), rec_fa.opcodes(&PADDUSB), ); - e.enc_32_64( + e.enc_both_inferred( uadd_sat.bind(vector(I16, sse_vector_size)), rec_fa.opcodes(&PADDUSW), ); @@ -1949,23 +1954,23 @@ fn define_simd( let isub = shared.by_name("isub"); for (ty, opcodes) in &[(I8, &PSUBB), (I16, &PSUBW), (I32, &PSUBD), (I64, &PSUBQ)] { let isub = isub.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(isub, rec_fa.opcodes(*opcodes)); + e.enc_both_inferred(isub, rec_fa.opcodes(*opcodes)); } // SIMD integer saturating subtraction - e.enc_32_64( + e.enc_both_inferred( ssub_sat.bind(vector(I8, sse_vector_size)), rec_fa.opcodes(&PSUBSB), ); - e.enc_32_64( + e.enc_both_inferred( ssub_sat.bind(vector(I16, sse_vector_size)), rec_fa.opcodes(&PSUBSW), ); - e.enc_32_64( + e.enc_both_inferred( usub_sat.bind(vector(I8, sse_vector_size)), rec_fa.opcodes(&PSUBUSB), ); - e.enc_32_64( + e.enc_both_inferred( usub_sat.bind(vector(I16, sse_vector_size)), rec_fa.opcodes(&PSUBUSW), ); @@ -1977,7 +1982,7 @@ fn define_simd( (I32, &PMULLD[..], Some(use_sse41_simd)), ] { let imul = imul.bind(vector(*ty, sse_vector_size)); - e.enc_32_64_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap); + e.enc_both_inferred_maybe_isap(imul, rec_fa.opcodes(opcodes), *isap); } // SIMD integer multiplication for I64x2 using a AVX512. @@ -1993,7 +1998,7 @@ fn define_simd( // SIMD integer average with rounding. 
for (ty, opcodes) in &[(I8, &PAVGB[..]), (I16, &PAVGW[..])] { let avgr = avg_round.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(avgr, rec_fa.opcodes(opcodes)); + e.enc_both_inferred(avgr, rec_fa.opcodes(opcodes)); } // SIMD logical operations @@ -2002,23 +2007,23 @@ fn define_simd( for ty in ValueType::all_lane_types().filter(allowed_simd_type) { // and let band = band.bind(vector(ty, sse_vector_size)); - e.enc_32_64(band, rec_fa.opcodes(&PAND)); + e.enc_both_inferred(band, rec_fa.opcodes(&PAND)); // and not (note flipped recipe operands to match band_not order) let band_not = band_not.bind(vector(ty, sse_vector_size)); - e.enc_32_64(band_not, rec_fax.opcodes(&PANDN)); + e.enc_both_inferred(band_not, rec_fax.opcodes(&PANDN)); // or let bor = bor.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bor, rec_fa.opcodes(&POR)); + e.enc_both_inferred(bor, rec_fa.opcodes(&POR)); // xor let bxor = bxor.bind(vector(ty, sse_vector_size)); - e.enc_32_64(bxor, rec_fa.opcodes(&PXOR)); + e.enc_both_inferred(bxor, rec_fa.opcodes(&PXOR)); // ptest let x86_ptest = x86_ptest.bind(vector(ty, sse_vector_size)); - e.enc_32_64_maybe_isap(x86_ptest, rec_fcmp.opcodes(&PTEST), Some(use_sse41_simd)); + e.enc_both_inferred_maybe_isap(x86_ptest, rec_fcmp.opcodes(&PTEST), Some(use_sse41_simd)); } // SIMD bitcast from I32/I64 to the low bits of a vector (e.g. I64x2); this register movement @@ -2026,7 +2031,7 @@ fn define_simd( // I128x1 but restrictions on the type builder prevent this; the general idea here is that // the upper bits are all zeroed and do not form parts of any separate lane. See // https://github.com/bytecodealliance/wasmtime/issues/1140. - e.enc_both( + e.enc_both_inferred( bitcast.bind(vector(I64, sse_vector_size)).bind(I32), rec_frurm.opcodes(&MOVD_LOAD_XMM), ); @@ -2038,31 +2043,31 @@ fn define_simd( // SIMD shift left for (ty, opcodes) in &[(I16, &PSLLW), (I32, &PSLLD), (I64, &PSLLQ)] { let x86_psll = x86_psll.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(x86_psll, rec_fa.opcodes(*opcodes)); + e.enc_both_inferred(x86_psll, rec_fa.opcodes(*opcodes)); } // SIMD shift right (logical) for (ty, opcodes) in &[(I16, &PSRLW), (I32, &PSRLD), (I64, &PSRLQ)] { let x86_psrl = x86_psrl.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(x86_psrl, rec_fa.opcodes(*opcodes)); + e.enc_both_inferred(x86_psrl, rec_fa.opcodes(*opcodes)); } // SIMD shift right (arithmetic) for (ty, opcodes) in &[(I16, &PSRAW), (I32, &PSRAD)] { let x86_psra = x86_psra.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(x86_psra, rec_fa.opcodes(*opcodes)); + e.enc_both_inferred(x86_psra, rec_fa.opcodes(*opcodes)); } // SIMD immediate shift for (ty, opcodes) in &[(I16, &PS_W_IMM), (I32, &PS_D_IMM), (I64, &PS_Q_IMM)] { let ishl_imm = ishl_imm.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(ishl_imm, rec_f_ib.opcodes(*opcodes).rrr(6)); + e.enc_both_inferred(ishl_imm, rec_f_ib.opcodes(*opcodes).rrr(6)); let ushr_imm = ushr_imm.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(ushr_imm, rec_f_ib.opcodes(*opcodes).rrr(2)); + e.enc_both_inferred(ushr_imm, rec_f_ib.opcodes(*opcodes).rrr(2)); let sshr_imm = sshr_imm.bind(vector(*ty, sse_vector_size)); - e.enc_32_64(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4)); + e.enc_both_inferred(sshr_imm, rec_f_ib.opcodes(*opcodes).rrr(4)); } // SIMD integer comparisons @@ -2081,8 +2086,8 @@ fn define_simd( let instruction = icmp .bind(Immediate::IntCC(*cc)) .bind(vector(*ty, sse_vector_size)); - let template = rec_icscc_fpr.nonrex().opcodes(opcodes); - e.enc_32_64_maybe_isap(instruction, template, 
*isa_predicate); + let template = rec_icscc_fpr.opcodes(opcodes); + e.enc_both_inferred_maybe_isap(instruction, template, *isa_predicate); } } @@ -2102,15 +2107,15 @@ fn define_simd( (I32, x86_pminu, &PMINUD[..], Some(use_sse41_simd)), ] { let inst = inst.bind(vector(*ty, sse_vector_size)); - e.enc_32_64_maybe_isap(inst, rec_fa.opcodes(opcodes), *isa_predicate); + e.enc_both_inferred_maybe_isap(inst, rec_fa.opcodes(opcodes), *isa_predicate); } // SIMD float comparisons - e.enc_both( + e.enc_both_inferred( fcmp.bind(vector(F32, sse_vector_size)), rec_pfcmp.opcodes(&CMPPS), ); - e.enc_both( + e.enc_both_inferred( fcmp.bind(vector(F64, sse_vector_size)), rec_pfcmp.opcodes(&CMPPD), ); @@ -2131,11 +2136,11 @@ fn define_simd( (F64, fmax, &MAXPD[..]), ] { let inst = inst.bind(vector(*ty, sse_vector_size)); - e.enc_both(inst, rec_fa.opcodes(opcodes)); + e.enc_both_inferred(inst, rec_fa.opcodes(opcodes)); } for (ty, inst, opcodes) in &[(F32, sqrt, &SQRTPS[..]), (F64, sqrt, &SQRTPD[..])] { let inst = inst.bind(vector(*ty, sse_vector_size)); - e.enc_both(inst, rec_furm.opcodes(opcodes)); + e.enc_both_inferred(inst, rec_furm.opcodes(opcodes)); } } diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index b136f514b1..daf1415120 100644 --- a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -547,41 +547,35 @@ pub(crate) fn define<'shared>( ); // XX /r - recipes.add_template( - Template::new( - EncodingRecipeBuilder::new("rr", &formats.binary, 1) - .operands_in(vec![gpr, gpr]) - .operands_out(vec![0]) - .emit( - r#" + recipes.add_template_inferred( + EncodingRecipeBuilder::new("rr", &formats.binary, 1) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![0]) + .emit( + r#" {{PUT_OP}}(bits, rex2(in_reg0, in_reg1), sink); modrm_rr(in_reg0, in_reg1, sink); "#, - ), - regs, - ) - .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), + ), + "size_with_inferred_rex_for_inreg0_inreg1", ); // XX /r with operands swapped. (RM form). - recipes.add_template( - Template::new( - EncodingRecipeBuilder::new("rrx", &formats.binary, 1) - .operands_in(vec![gpr, gpr]) - .operands_out(vec![0]) - .emit( - r#" + recipes.add_template_inferred( + EncodingRecipeBuilder::new("rrx", &formats.binary, 1) + .operands_in(vec![gpr, gpr]) + .operands_out(vec![0]) + .emit( + r#" {{PUT_OP}}(bits, rex2(in_reg1, in_reg0), sink); modrm_rr(in_reg1, in_reg0, sink); "#, - ), - regs, - ) - .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), + ), + "size_with_inferred_rex_for_inreg0_inreg1", ); // XX /r with FPR ins and outs. A form. - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("fa", &formats.binary, 1) .operands_in(vec![fpr, fpr]) .operands_out(vec![0]) @@ -591,10 +585,11 @@ pub(crate) fn define<'shared>( modrm_rr(in_reg1, in_reg0, sink); "#, ), + "size_with_inferred_rex_for_inreg0_inreg1", ); // XX /r with FPR ins and outs. A form with input operands swapped. - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("fax", &formats.binary, 1) .operands_in(vec![fpr, fpr]) .operands_out(vec![1]) @@ -604,11 +599,13 @@ pub(crate) fn define<'shared>( modrm_rr(in_reg0, in_reg1, sink); "#, ), + // The operand order does not matter for calculating whether a REX prefix is needed. + "size_with_inferred_rex_for_inreg0_inreg1", ); // XX /r with FPR ins and outs. A form with a byte immediate. 
{ - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("fa_ib", &formats.insert_lane, 2) .operands_in(vec![fpr, fpr]) .operands_out(vec![0]) @@ -626,6 +623,7 @@ pub(crate) fn define<'shared>( sink.put1(imm as u8); "#, ), + "size_with_inferred_rex_for_inreg0_inreg1", ); } @@ -740,7 +738,7 @@ pub(crate) fn define<'shared>( ); // XX /r, RM form, FPR -> FPR. - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("furm", &formats.unary, 1) .operands_in(vec![fpr]) .operands_out(vec![fpr]) @@ -751,6 +749,7 @@ pub(crate) fn define<'shared>( modrm_rr(in_reg0, out_reg0, sink); "#, ), + "size_with_inferred_rex_for_inreg0_outreg0", ); // Same as furm, but with the source register specified directly. @@ -768,21 +767,18 @@ pub(crate) fn define<'shared>( ); // XX /r, RM form, GPR -> FPR. - recipes.add_template( - Template::new( - EncodingRecipeBuilder::new("frurm", &formats.unary, 1) - .operands_in(vec![gpr]) - .operands_out(vec![fpr]) - .clobbers_flags(false) - .emit( - r#" + recipes.add_template_inferred( + EncodingRecipeBuilder::new("frurm", &formats.unary, 1) + .operands_in(vec![gpr]) + .operands_out(vec![fpr]) + .clobbers_flags(false) + .emit( + r#" {{PUT_OP}}(bits, rex2(in_reg0, out_reg0), sink); modrm_rr(in_reg0, out_reg0, sink); "#, - ), - regs, - ) - .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_outreg0"), + ), + "size_with_inferred_rex_for_inreg0_outreg0", ); // XX /r, RM form, FPR -> GPR. @@ -909,31 +905,28 @@ pub(crate) fn define<'shared>( // XX /n ib with 8-bit immediate sign-extended. { - recipes.add_template( - Template::new( - EncodingRecipeBuilder::new("r_ib", &formats.binary_imm, 2) - .operands_in(vec![gpr]) - .operands_out(vec![0]) - .inst_predicate(InstructionPredicate::new_is_signed_int( - &*formats.binary_imm, - "imm", - 8, - 0, - )) - .emit( - r#" + recipes.add_template_inferred( + EncodingRecipeBuilder::new("r_ib", &formats.binary_imm, 2) + .operands_in(vec![gpr]) + .operands_out(vec![0]) + .inst_predicate(InstructionPredicate::new_is_signed_int( + &*formats.binary_imm, + "imm", + 8, + 0, + )) + .emit( + r#" {{PUT_OP}}(bits, rex1(in_reg0), sink); modrm_r_bits(in_reg0, bits, sink); let imm: i64 = imm.into(); sink.put1(imm as u8); "#, - ), - regs, - ) - .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0"), + ), + "size_with_inferred_rex_for_inreg0", ); - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("f_ib", &formats.binary_imm, 2) .operands_in(vec![fpr]) .operands_out(vec![0]) @@ -951,6 +944,7 @@ pub(crate) fn define<'shared>( sink.put1(imm as u8); "#, ), + "size_with_inferred_rex_for_inreg0", ); // XX /n id with 32-bit immediate sign-extended. @@ -981,7 +975,7 @@ pub(crate) fn define<'shared>( // XX /r ib with 8-bit unsigned immediate (e.g. for pshufd) { - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("r_ib_unsigned_fpr", &formats.extract_lane, 2) .operands_in(vec![fpr]) .operands_out(vec![fpr]) @@ -999,12 +993,13 @@ pub(crate) fn define<'shared>( sink.put1(imm as u8); "#, ), + "size_with_inferred_rex_for_inreg0_outreg0", ); } // XX /r ib with 8-bit unsigned immediate (e.g. 
for extractlane) { - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("r_ib_unsigned_gpr", &formats.extract_lane, 2) .operands_in(vec![fpr]) .operands_out(vec![gpr]) @@ -1018,13 +1013,13 @@ pub(crate) fn define<'shared>( let imm:i64 = lane.into(); sink.put1(imm as u8); "#, - ), + ), "size_with_inferred_rex_for_inreg0_outreg0" ); } // XX /r ib with 8-bit unsigned immediate (e.g. for insertlane) { - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("r_ib_unsigned_r", &formats.insert_lane, 2) .operands_in(vec![fpr, gpr]) .operands_out(vec![0]) @@ -1042,6 +1037,7 @@ pub(crate) fn define<'shared>( sink.put1(imm as u8); "#, ), + "size_with_inferred_rex_for_inreg0_inreg1", ); } @@ -2825,7 +2821,7 @@ pub(crate) fn define<'shared>( ); // XX /r, RM form. Compare two FPR registers and set flags. - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("fcmp", &formats.binary, 1) .operands_in(vec![fpr, fpr]) .operands_out(vec![reg_rflags]) @@ -2835,6 +2831,7 @@ pub(crate) fn define<'shared>( modrm_rr(in_reg1, in_reg0, sink); "#, ), + "size_with_inferred_rex_for_inreg0_inreg1", ); { @@ -3089,7 +3086,7 @@ pub(crate) fn define<'shared>( .inferred_rex_compute_size("size_with_inferred_rex_for_inreg0_inreg1"), ); - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("icscc_fpr", &formats.int_compare, 1) .operands_in(vec![fpr, fpr]) .operands_out(vec![0]) @@ -3100,6 +3097,7 @@ pub(crate) fn define<'shared>( modrm_rr(in_reg1, in_reg0, sink); "#, ), + "size_with_inferred_rex_for_inreg0_inreg1", ); { @@ -3219,7 +3217,7 @@ pub(crate) fn define<'shared>( .iter() .map(|name| Literal::enumerator_for(floatcc, name)) .collect(); - recipes.add_template_recipe( + recipes.add_template_inferred( EncodingRecipeBuilder::new("pfcmp", &formats.float_compare, 2) .operands_in(vec![fpr, fpr]) .operands_out(vec![0]) @@ -3248,6 +3246,7 @@ pub(crate) fn define<'shared>( sink.put1(imm); "#, ), + "size_with_inferred_rex_for_inreg0_inreg1", ); } diff --git a/cranelift/codegen/src/isa/x86/binemit.rs b/cranelift/codegen/src/isa/x86/binemit.rs index 615af47668..a92589e632 100644 --- a/cranelift/codegen/src/isa/x86/binemit.rs +++ b/cranelift/codegen/src/isa/x86/binemit.rs @@ -197,7 +197,7 @@ fn put_dynrexmp2(bits: u16, rex: u8, sink: &mut CS) { sink.put1(bits as u8); } -// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix. +/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix. fn put_mp3(bits: u16, rex: u8, sink: &mut CS) { debug_assert_eq!(bits & 0x8800, 0x0800, "Invalid encoding bits for Mp3*"); debug_assert_eq!(rex, BASE_REX, "Invalid registers for REX-less Mp3 encoding"); @@ -208,7 +208,7 @@ fn put_mp3(bits: u16, rex: u8, sink: &mut CS) { sink.put1(bits as u8); } -// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and REX +/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and REX fn put_rexmp3(bits: u16, rex: u8, sink: &mut CS) { debug_assert_eq!(bits & 0x0800, 0x0800, "Invalid encoding bits for RexMp3*"); let enc = EncodingBits::from(bits); @@ -219,6 +219,23 @@ fn put_rexmp3(bits: u16, rex: u8, sink: &mut CS) { sink.put1(bits as u8); } +/// Emit three-byte opcode (0F 3[8A] XX) with mandatory prefix and an inferred REX prefix. 
+fn put_dynrexmp3(bits: u16, rex: u8, sink: &mut CS) { + debug_assert_eq!( + bits & 0x0800, + 0x0800, + "Invalid encoding bits for DynRexMp3*" + ); + let enc = EncodingBits::from(bits); + sink.put1(PREFIX[(enc.pp() - 1) as usize]); + if needs_rex(bits, rex) { + rex_prefix(bits, rex, sink); + } + sink.put1(0x0f); + sink.put1(OP3_BYTE2[(enc.mm() - 2) as usize]); + sink.put1(bits as u8); +} + /// Defines the EVEX context for the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte). Table 2-36 in /// section 2.6.10 (Intel Software Development Manual, volume 2A) describes how these bits can be /// used together for certain classes of instructions; i.e., special care should be taken to ensure diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif index f4b969c90d..6e5f7520e3 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-binemit.clif @@ -176,25 +176,49 @@ block0: function %float_arithmetic_f32x4(f32x4, f32x4) { block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm5]): -[-, %xmm3] v2 = fadd v0, v1 ; bin: 40 0f 58 dd -[-, %xmm3] v3 = fsub v0, v1 ; bin: 40 0f 5c dd -[-, %xmm3] v4 = fmul v0, v1 ; bin: 40 0f 59 dd -[-, %xmm3] v5 = fdiv v0, v1 ; bin: 40 0f 5e dd -[-, %xmm3] v6 = fmin v0, v1 ; bin: 40 0f 5d dd -[-, %xmm3] v7 = fmax v0, v1 ; bin: 40 0f 5f dd -[-, %xmm3] v8 = sqrt v0 ; bin: 40 0f 51 db +[-, %xmm3] v2 = fadd v0, v1 ; bin: 0f 58 dd +[-, %xmm3] v3 = fsub v0, v1 ; bin: 0f 5c dd +[-, %xmm3] v4 = fmul v0, v1 ; bin: 0f 59 dd +[-, %xmm3] v5 = fdiv v0, v1 ; bin: 0f 5e dd +[-, %xmm3] v6 = fmin v0, v1 ; bin: 0f 5d dd +[-, %xmm3] v7 = fmax v0, v1 ; bin: 0f 5f dd +[-, %xmm3] v8 = sqrt v0 ; bin: 0f 51 db + return +} + +function %float_arithmetic_f32x4_rex(f32x4, f32x4) { +block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm10]): +[-, %xmm3] v2 = fadd v0, v1 ; bin: 41 0f 58 da +[-, %xmm3] v3 = fsub v0, v1 ; bin: 41 0f 5c da +[-, %xmm3] v4 = fmul v0, v1 ; bin: 41 0f 59 da +[-, %xmm3] v5 = fdiv v0, v1 ; bin: 41 0f 5e da +[-, %xmm3] v6 = fmin v0, v1 ; bin: 41 0f 5d da +[-, %xmm3] v7 = fmax v0, v1 ; bin: 41 0f 5f da +[-, %xmm3] v8 = sqrt v1 ; bin: 41 0f 51 da return } function %float_arithmetic_f64x2(f64x2, f64x2) { block0(v0: f64x2 [%xmm3], v1: f64x2 [%xmm5]): -[-, %xmm3] v2 = fadd v0, v1 ; bin: 66 40 0f 58 dd -[-, %xmm3] v3 = fsub v0, v1 ; bin: 66 40 0f 5c dd -[-, %xmm3] v4 = fmul v0, v1 ; bin: 66 40 0f 59 dd -[-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 40 0f 5e dd -[-, %xmm3] v6 = fmin v0, v1 ; bin: 66 40 0f 5d dd -[-, %xmm3] v7 = fmax v0, v1 ; bin: 66 40 0f 5f dd -[-, %xmm3] v8 = sqrt v0 ; bin: 66 40 0f 51 db +[-, %xmm3] v2 = fadd v0, v1 ; bin: 66 0f 58 dd +[-, %xmm3] v3 = fsub v0, v1 ; bin: 66 0f 5c dd +[-, %xmm3] v4 = fmul v0, v1 ; bin: 66 0f 59 dd +[-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 0f 5e dd +[-, %xmm3] v6 = fmin v0, v1 ; bin: 66 0f 5d dd +[-, %xmm3] v7 = fmax v0, v1 ; bin: 66 0f 5f dd +[-, %xmm3] v8 = sqrt v0 ; bin: 66 0f 51 db + return +} + +function %float_arithmetic_f64x2_rex(f64x2, f64x2) { +block0(v0: f64x2 [%xmm11], v1: f64x2 [%xmm13]): +[-, %xmm11] v2 = fadd v0, v1 ; bin: 66 45 0f 58 dd +[-, %xmm11] v3 = fsub v0, v1 ; bin: 66 45 0f 5c dd +[-, %xmm11] v4 = fmul v0, v1 ; bin: 66 45 0f 59 dd +[-, %xmm11] v5 = fdiv v0, v1 ; bin: 66 45 0f 5e dd +[-, %xmm11] v6 = fmin v0, v1 ; bin: 66 45 0f 5d dd +[-, %xmm11] v7 = fmax v0, v1 ; bin: 66 45 0f 5f dd +[-, %xmm11] v8 = sqrt v0 ; bin: 66 45 0f 51 db return } diff --git 
a/cranelift/filetests/filetests/isa/x86/simd-comparison-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-comparison-binemit.clif index c5463491fe..70c1f2f8e5 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-comparison-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-comparison-binemit.clif @@ -87,26 +87,52 @@ block0(v0: i32x4 [%xmm2], v1: i32x4 [%xmm4]): function %fcmp_f32x4(f32x4, f32x4) { block0(v0: f32x4 [%xmm2], v1: f32x4 [%xmm4]): -[-, %xmm2] v2 = fcmp eq v0, v1 ; bin: 40 0f c2 d4 00 -[-, %xmm2] v3 = fcmp lt v0, v1 ; bin: 40 0f c2 d4 01 -[-, %xmm2] v4 = fcmp le v0, v1 ; bin: 40 0f c2 d4 02 -[-, %xmm2] v5 = fcmp uno v0, v1 ; bin: 40 0f c2 d4 03 -[-, %xmm2] v6 = fcmp ne v0, v1 ; bin: 40 0f c2 d4 04 -[-, %xmm2] v7 = fcmp uge v0, v1 ; bin: 40 0f c2 d4 05 -[-, %xmm2] v8 = fcmp ugt v0, v1 ; bin: 40 0f c2 d4 06 -[-, %xmm2] v9 = fcmp ord v0, v1 ; bin: 40 0f c2 d4 07 +[-, %xmm2] v2 = fcmp eq v0, v1 ; bin: 0f c2 d4 00 +[-, %xmm2] v3 = fcmp lt v0, v1 ; bin: 0f c2 d4 01 +[-, %xmm2] v4 = fcmp le v0, v1 ; bin: 0f c2 d4 02 +[-, %xmm2] v5 = fcmp uno v0, v1 ; bin: 0f c2 d4 03 +[-, %xmm2] v6 = fcmp ne v0, v1 ; bin: 0f c2 d4 04 +[-, %xmm2] v7 = fcmp uge v0, v1 ; bin: 0f c2 d4 05 +[-, %xmm2] v8 = fcmp ugt v0, v1 ; bin: 0f c2 d4 06 +[-, %xmm2] v9 = fcmp ord v0, v1 ; bin: 0f c2 d4 07 + return +} + +function %fcmp_f32x4_rex(f32x4, f32x4) { +block0(v0: f32x4 [%xmm8], v1: f32x4 [%xmm8]): +[-, %xmm8] v2 = fcmp eq v0, v1 ; bin: 45 0f c2 c0 00 +[-, %xmm8] v3 = fcmp lt v0, v1 ; bin: 45 0f c2 c0 01 +[-, %xmm8] v4 = fcmp le v0, v1 ; bin: 45 0f c2 c0 02 +[-, %xmm8] v5 = fcmp uno v0, v1 ; bin: 45 0f c2 c0 03 +[-, %xmm8] v6 = fcmp ne v0, v1 ; bin: 45 0f c2 c0 04 +[-, %xmm8] v7 = fcmp uge v0, v1 ; bin: 45 0f c2 c0 05 +[-, %xmm8] v8 = fcmp ugt v0, v1 ; bin: 45 0f c2 c0 06 +[-, %xmm8] v9 = fcmp ord v0, v1 ; bin: 45 0f c2 c0 07 return } function %fcmp_f64x2(f64x2, f64x2) { block0(v0: f64x2 [%xmm2], v1: f64x2 [%xmm0]): -[-, %xmm2] v2 = fcmp eq v0, v1 ; bin: 66 40 0f c2 d0 00 -[-, %xmm2] v3 = fcmp lt v0, v1 ; bin: 66 40 0f c2 d0 01 -[-, %xmm2] v4 = fcmp le v0, v1 ; bin: 66 40 0f c2 d0 02 -[-, %xmm2] v5 = fcmp uno v0, v1 ; bin: 66 40 0f c2 d0 03 -[-, %xmm2] v6 = fcmp ne v0, v1 ; bin: 66 40 0f c2 d0 04 -[-, %xmm2] v7 = fcmp uge v0, v1 ; bin: 66 40 0f c2 d0 05 -[-, %xmm2] v8 = fcmp ugt v0, v1 ; bin: 66 40 0f c2 d0 06 -[-, %xmm2] v9 = fcmp ord v0, v1 ; bin: 66 40 0f c2 d0 07 +[-, %xmm2] v2 = fcmp eq v0, v1 ; bin: 66 0f c2 d0 00 +[-, %xmm2] v3 = fcmp lt v0, v1 ; bin: 66 0f c2 d0 01 +[-, %xmm2] v4 = fcmp le v0, v1 ; bin: 66 0f c2 d0 02 +[-, %xmm2] v5 = fcmp uno v0, v1 ; bin: 66 0f c2 d0 03 +[-, %xmm2] v6 = fcmp ne v0, v1 ; bin: 66 0f c2 d0 04 +[-, %xmm2] v7 = fcmp uge v0, v1 ; bin: 66 0f c2 d0 05 +[-, %xmm2] v8 = fcmp ugt v0, v1 ; bin: 66 0f c2 d0 06 +[-, %xmm2] v9 = fcmp ord v0, v1 ; bin: 66 0f c2 d0 07 + return +} + +function %fcmp_f64x2_rex(f64x2, f64x2) { +block0(v0: f64x2 [%xmm9], v1: f64x2 [%xmm11]): +[-, %xmm9] v2 = fcmp eq v0, v1 ; bin: 66 45 0f c2 cb 00 +[-, %xmm9] v3 = fcmp lt v0, v1 ; bin: 66 45 0f c2 cb 01 +[-, %xmm9] v4 = fcmp le v0, v1 ; bin: 66 45 0f c2 cb 02 +[-, %xmm9] v5 = fcmp uno v0, v1 ; bin: 66 45 0f c2 cb 03 +[-, %xmm9] v6 = fcmp ne v0, v1 ; bin: 66 45 0f c2 cb 04 +[-, %xmm9] v7 = fcmp uge v0, v1 ; bin: 66 45 0f c2 cb 05 +[-, %xmm9] v8 = fcmp ugt v0, v1 ; bin: 66 45 0f c2 cb 06 +[-, %xmm9] v9 = fcmp ord v0, v1 ; bin: 66 45 0f c2 cb 07 return } diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif 
b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif index 1406a286c6..bc2f873fe6 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif @@ -81,7 +81,7 @@ block0: function %pshufd() { block0: [-, %rax] v0 = iconst.i32 42 -[-, %xmm0] v1 = scalar_to_vector.i32x4 v0 ; bin: 66 40 0f 6e c0 +[-, %xmm0] v1 = scalar_to_vector.i32x4 v0 ; bin: 66 0f 6e c0 [-, %xmm0] v2 = x86_pshufd v1, 0 ; bin: 66 0f 70 c0 00 return } @@ -89,9 +89,9 @@ block0: function %pshufb() { block0: [-, %rax] v0 = iconst.i8 42 -[-, %xmm0] v1 = scalar_to_vector.i8x16 v0 ; bin: 66 40 0f 6e c0 +[-, %xmm0] v1 = scalar_to_vector.i8x16 v0 ; bin: 66 0f 6e c0 [-, %rbx] v2 = iconst.i8 43 -[-, %xmm4] v3 = scalar_to_vector.i8x16 v2 ; bin: 66 40 0f 6e e3 -[-, %xmm0] v4 = x86_pshufb v1, v3 ; bin: 66 0f 38 00 c4 +[-, %xmm12] v3 = scalar_to_vector.i8x16 v2 ; bin: 66 44 0f 6e e3 +[-, %xmm0] v4 = x86_pshufb v1, v3 ; bin: 66 41 0f 38 00 c4 return } diff --git a/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif index cbec7ab7aa..92d83867d7 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-memory-binemit.clif @@ -8,8 +8,8 @@ block0(v0: i64 [%rax]): [-] store v10, v0 ; bin: heap_oob 0f 11 00 ; use displacement -[-, %xmm0] v11 = load.f32x4 v0+42 ; bin: heap_oob 0f 10 40 2a -[-] store v11, v0+42 ; bin: heap_oob 0f 11 40 2a +[-, %xmm0] v11 = load.f32x4 v0+42 ; bin: heap_oob 40 0f 10 40 2a +[-] store v11, v0+42 ; bin: heap_oob 40 0f 11 40 2a ; use REX prefix [-, %xmm8] v12 = load.i8x16 v0 ; bin: heap_oob 44 0f 10 00 @@ -22,16 +22,17 @@ function %load_store_complex(i64, i64) { block0(v0: i64 [%rax], v1: i64 [%rbx]): ; %xmm1 corresponds to ModR/M 0x04; the 0b100 in the R/M slot indicates a SIB byte follows ; %rax and %rbx form the SIB 0x18 -[-, %xmm1] v10 = load_complex.f64x2 v0+v1 ; bin: heap_oob 0f 10 0c 18 +[-, %xmm1] v10 = load_complex.f64x2 v0+v1 ; bin: heap_oob 40 0f 10 0c 18 ; enabling bit 6 of the ModR/M byte indicates a disp8 follows -[-] store_complex v10, v0+v1+5 ; bin: heap_oob 0f 11 4c 18 05 +[-] store_complex v10, v0+v1+5 ; bin: heap_oob 40 0f 11 4c 18 05 return } function %copy_to_ssa() { block0: -[-, %xmm1] v0 = copy_to_ssa.i64x2 %xmm3 ; bin: 0f 28 cb +[-, %xmm1] v0 = copy_to_ssa.i64x2 %xmm3 ; bin: 40 0f 28 cb +[-, %xmm2] v1 = copy_to_ssa.i64x2 %xmm15 ; bin: 41 0f 28 d7 return }
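
Note on the inference logic exercised by the updated tests: the dynamic-REX recipes only emit a REX byte when it actually carries information, i.e. when an operand lives in an extended register (%xmm8-%xmm15) or the encoding needs REX.W. The following is a minimal, self-contained Rust sketch of that decision, not the Cranelift implementation itself; `rex2`, `needs_rex`, and `BASE_REX` here are simplified stand-ins for the helpers referenced in the patch (the real versions also account for REX.W and the encoding bits).

// Standalone sketch of dynamic REX inference for a two-register SIMD op.
// Register numbers follow the hardware encoding, 0..=15.

const BASE_REX: u8 = 0b0100_0000; // 0x40, the fixed high nibble of a REX prefix

// Fold the high bits of the r/m and reg operands into REX.B and REX.R.
fn rex2(rm: u8, reg: u8) -> u8 {
    BASE_REX | ((rm >> 3) & 1) | (((reg >> 3) & 1) << 2)
}

// A REX byte is only emitted when it would change the meaning of the
// instruction, i.e. when any of its low four bits (W/R/X/B) is set.
fn needs_rex(rex: u8) -> bool {
    rex != BASE_REX
}

fn main() {
    // fadd.f32x4 with %xmm3, %xmm5: no REX byte, matching "0f 58 dd" above.
    assert!(!needs_rex(rex2(5, 3)));
    // fadd.f32x4 with %xmm3, %xmm10: REX.B is set, matching "41 0f 58 da" above.
    let rex = rex2(10, 3);
    assert_eq!(rex, 0x41);
    assert!(needs_rex(rex));
}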