diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 9ce33817c7..a5df9298e8 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -270,7 +270,7 @@ impl PerCpuModeEncodings {
     /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened
     fn enc_32_64_maybe_isap(
         &mut self,
-        inst: BoundInstruction,
+        inst: impl Clone + Into<InstSpec>,
         template: Template,
         isap: Option<SettingPredicateNumber>,
     ) {
@@ -280,7 +280,7 @@
 
     fn enc32_maybe_isap(
         &mut self,
-        inst: BoundInstruction,
+        inst: impl Into<InstSpec>,
         template: Template,
         isap: Option<SettingPredicateNumber>,
     ) {
@@ -292,7 +292,7 @@
 
     fn enc64_maybe_isap(
         &mut self,
-        inst: BoundInstruction,
+        inst: impl Into<InstSpec>,
         template: Template,
         isap: Option<SettingPredicateNumber>,
     ) {
@@ -432,6 +432,7 @@ pub fn define(
     let uload8_complex = shared.by_name("uload8_complex");
     let ushr = shared.by_name("ushr");
     let ushr_imm = shared.by_name("ushr_imm");
+    let vconst = shared.by_name("vconst");
     let x86_bsf = x86.by_name("x86_bsf");
     let x86_bsr = x86.by_name("x86_bsr");
     let x86_cvtt2si = x86.by_name("x86_cvtt2si");
@@ -578,6 +579,7 @@ pub fn define(
     let rec_urm = r.template("urm");
     let rec_urm_noflags = r.template("urm_noflags");
     let rec_urm_noflags_abcd = r.template("urm_noflags_abcd");
+    let rec_vconst = r.template("vconst");
 
     // Predicates shorthands.
     let all_ones_funcaddrs_and_not_is_pic =
@@ -1785,6 +1787,18 @@ pub fn define(
         }
     }
 
+    // SIMD vconst using MOVUPS
+    // TODO it would be ideal if eventually this became the more efficient MOVAPS but we would have
+    // to guarantee that the constants are aligned when emitted and there is currently no mechanism
+    // for that; alternatively, constants could be loaded into XMM registers using a sequence like:
+    // MOVQ + MOVHPD + MOVQ + MOVLPD (this allows the constants to be immediates instead of stored
+    // in memory) but some performance measurements are needed.
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
+        let instruction = vconst.bind_vector_from_lane(ty, sse_vector_size);
+        let template = rec_vconst.nonrex().opcodes(vec![0x0f, 0x10]);
+        e.enc_32_64_maybe_isap(instruction, template, None); // from SSE
+    }
+
     // Reference type instructions
 
     // Null references implemented as iconst 0.
diff --git a/cranelift/filetests/filetests/isa/x86/compile-vconst.clif b/cranelift/filetests/filetests/isa/x86/compile-vconst.clif
new file mode 100644
index 0000000000..c64c9fc503
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/compile-vconst.clif
@@ -0,0 +1,16 @@
+test compile
+set enable_simd=true
+set probestack_enabled=false
+target x86_64 haswell
+
+; use baldrdash calling convention here for simplicity (avoids prologue, epilogue)
+function %test_vconst_i32() -> i32x4 baldrdash_system_v {
+ebb0:
+    v0 = vconst.i32x4 0x1234
+    return v0
+}
+
+; check: ebb0:
+; nextln: v0 = vconst.i32x4 0x1234
+; nextln: return v0
+; nextln: }
diff --git a/cranelift/filetests/filetests/isa/x86/vconst.clif b/cranelift/filetests/filetests/isa/x86/vconst.clif
new file mode 100644
index 0000000000..f7b9ce4627
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/vconst.clif
@@ -0,0 +1,11 @@
+test binemit
+set opt_level=best
+set enable_simd
+target x86_64
+
+function %test_vconst_b8() {
+ebb0:
+[-, %xmm2]    v0 = vconst.b8x16 0x00    ; bin: 0f 10 15 00000008 PCRelRodata4(15)
+[-, %xmm3]    v1 = vconst.b8x16 0x01    ; bin: 0f 10 1d 00000011 PCRelRodata4(31)
+    return
+}
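Not part of the patch above: the encoding loop binds vconst for every lane type of at least 8 bits, so the MOVUPS-based encoding covers every 128-bit vector type, not only the i32x4 and b8x16 cases exercised by the new tests. A minimal sketch of a compile test for another lane type, assuming the same settings as compile-vconst.clif; the file is hypothetical and the function name and constant are illustrative, not from this diff:

test compile
set enable_simd=true
set probestack_enabled=false
target x86_64 haswell

; hypothetical companion to compile-vconst.clif: same encoding path, f32x4 lanes
function %test_vconst_f32() -> f32x4 baldrdash_system_v {
ebb0:
    v0 = vconst.f32x4 0x00
    return v0
}

; check: ebb0:
; nextln: v0 = vconst.f32x4 0x00
; nextln: return v0
; nextln: }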