diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 807935b805..937df6830e 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1607,8 +1607,11 @@ fn define_simd( let sadd_sat = shared.by_name("sadd_sat"); let scalar_to_vector = shared.by_name("scalar_to_vector"); let sload8x8 = shared.by_name("sload8x8"); + let sload8x8_complex = shared.by_name("sload8x8_complex"); let sload16x4 = shared.by_name("sload16x4"); + let sload16x4_complex = shared.by_name("sload16x4_complex"); let sload32x2 = shared.by_name("sload32x2"); + let sload32x2_complex = shared.by_name("sload32x2_complex"); let spill = shared.by_name("spill"); let sqrt = shared.by_name("sqrt"); let sshr_imm = shared.by_name("sshr_imm"); @@ -1617,8 +1620,11 @@ fn define_simd( let store_complex = shared.by_name("store_complex"); let uadd_sat = shared.by_name("uadd_sat"); let uload8x8 = shared.by_name("uload8x8"); + let uload8x8_complex = shared.by_name("uload8x8_complex"); let uload16x4 = shared.by_name("uload16x4"); + let uload16x4_complex = shared.by_name("uload16x4_complex"); let uload32x2 = shared.by_name("uload32x2"); + let uload32x2_complex = shared.by_name("uload32x2_complex"); let ushr_imm = shared.by_name("ushr_imm"); let usub_sat = shared.by_name("usub_sat"); let vconst = shared.by_name("vconst"); @@ -1983,6 +1989,35 @@ fn define_simd( } } + // SIMD load extend (complex addressing) + let is_load_complex_length_two = + InstructionPredicate::new_length_equals(&*formats.load_complex, 2); + for (inst, opcodes) in &[ + (uload8x8_complex, &PMOVZXBW), + (uload16x4_complex, &PMOVZXWD), + (uload32x2_complex, &PMOVZXDQ), + (sload8x8_complex, &PMOVSXBW), + (sload16x4_complex, &PMOVSXWD), + (sload32x2_complex, &PMOVSXDQ), + ] { + for recipe in &[ + rec_fldWithIndex, + rec_fldWithIndexDisp8, + rec_fldWithIndexDisp32, + ] { + let template = recipe.opcodes(*opcodes); + let predicate = |encoding: EncodingBuilder| { + encoding + .isa_predicate(use_sse41_simd) + .inst_predicate(is_load_complex_length_two.clone()) + }; + e.enc32_func(inst.clone(), template.clone(), predicate); + // No infer_rex calculator for these recipes; place REX version first as in enc_x86_64. + e.enc64_func(inst.clone(), template.rex(), predicate); + e.enc64_func(inst.clone(), template, predicate); + } + } + // SIMD integer addition for (ty, opcodes) in &[(I8, &PADDB), (I16, &PADDW), (I32, &PADDD), (I64, &PADDQ)] { let iadd = iadd.bind(vector(*ty, sse_vector_size)); diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index c9a1de311b..e5919ff96a 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -1172,6 +1172,20 @@ pub(crate) fn define( .can_load(true), ); + ig.push( + Inst::new( + "uload8x8_complex", + r#" + Load an 8x8 vector (64 bits) from memory at ``sum(args) + Offset`` and zero-extend into an + i16x8 vector. + "#, + &formats.load_complex, + ) + .operands_in(vec![MemFlags, args, Offset]) + .operands_out(vec![a]) + .can_load(true), + ); + ig.push( Inst::new( "sload8x8", @@ -1186,6 +1200,20 @@ pub(crate) fn define( .can_load(true), ); + ig.push( + Inst::new( + "sload8x8_complex", + r#" + Load an 8x8 vector (64 bits) from memory at ``sum(args) + Offset`` and sign-extend into an + i16x8 vector. + "#, + &formats.load_complex, + ) + .operands_in(vec![MemFlags, args, Offset]) + .operands_out(vec![a]) + .can_load(true), + ); + let I32x4 = &TypeVar::new( "I32x4", "A SIMD vector with exactly 4 lanes of 32-bit values", @@ -1201,7 +1229,7 @@ pub(crate) fn define( Inst::new( "uload16x4", r#" - Load an 16x4 vector (64 bits) from memory at ``p + Offset`` and zero-extend into an i32x4 + Load a 16x4 vector (64 bits) from memory at ``p + Offset`` and zero-extend into an i32x4 vector. "#, &formats.load, @@ -1211,6 +1239,20 @@ pub(crate) fn define( .can_load(true), ); + ig.push( + Inst::new( + "uload16x4_complex", + r#" + Load a 16x4 vector (64 bits) from memory at ``sum(args) + Offset`` and zero-extend into an + i32x4 vector. + "#, + &formats.load_complex, + ) + .operands_in(vec![MemFlags, args, Offset]) + .operands_out(vec![a]) + .can_load(true), + ); + ig.push( Inst::new( "sload16x4", @@ -1225,6 +1267,20 @@ pub(crate) fn define( .can_load(true), ); + ig.push( + Inst::new( + "sload16x4_complex", + r#" + Load a 16x4 vector (64 bits) from memory at ``sum(args) + Offset`` and sign-extend into an + i32x4 vector. + "#, + &formats.load_complex, + ) + .operands_in(vec![MemFlags, args, Offset]) + .operands_out(vec![a]) + .can_load(true), + ); + let I64x2 = &TypeVar::new( "I64x2", "A SIMD vector with exactly 2 lanes of 64-bit values", @@ -1250,6 +1306,20 @@ pub(crate) fn define( .can_load(true), ); + ig.push( + Inst::new( + "uload32x2_complex", + r#" + Load a 32x2 vector (64 bits) from memory at ``sum(args) + Offset`` and zero-extend into an + i64x2 vector. + "#, + &formats.load_complex, + ) + .operands_in(vec![MemFlags, args, Offset]) + .operands_out(vec![a]) + .can_load(true), + ); + ig.push( Inst::new( "sload32x2", @@ -1264,6 +1334,20 @@ pub(crate) fn define( .can_load(true), ); + ig.push( + Inst::new( + "sload32x2_complex", + r#" + Load a 32x2 vector (64 bits) from memory at ``sum(args) + Offset`` and sign-extend into an + i64x2 vector. + "#, + &formats.load_complex, + ) + .operands_in(vec![MemFlags, args, Offset]) + .operands_out(vec![a]) + .can_load(true), + ); + let x = &Operand::new("x", Mem).with_doc("Value to be stored"); let a = &Operand::new("a", Mem).with_doc("Value loaded"); let Offset = diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 831d722694..46fab7aadb 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1354,11 +1354,17 @@ pub(crate) fn lower_insn_to_regs>(ctx: &mut C, insn: IRIns | Opcode::ScalarToVector | Opcode::Swizzle | Opcode::Uload8x8 + | Opcode::Uload8x8Complex | Opcode::Sload8x8 + | Opcode::Sload8x8Complex | Opcode::Uload16x4 + | Opcode::Uload16x4Complex | Opcode::Sload16x4 + | Opcode::Sload16x4Complex | Opcode::Uload32x2 - | Opcode::Sload32x2 => { + | Opcode::Uload32x2Complex + | Opcode::Sload32x2 + | Opcode::Sload32x2Complex => { // TODO panic!("Vector ops not implemented."); }