diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 0f13b95002..56d2dc0f28 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2653,6 +2653,24 @@
           (x64_movrm $I64 addr_lo value_lo)
           (x64_movrm $I64 addr_hi value_hi)))))
 
+;; Slightly optimize the extraction of the first lane from a vector which is
+;; stored in memory. When the first lane specifically is selected, the
+;; standard `movss` and `movsd` instructions can be used as if we're storing
+;; an f32 or f64: even if the source is an integer vector, the result of
+;; the instruction is the same.
+(rule 2 (lower (store flags
+                      (has_type (ty_32 _) (extractlane value (u8_from_uimm8 0)))
+                      address
+                      offset))
+  (side_effect
+    (x64_movss_store (to_amode flags address offset) value)))
+(rule 3 (lower (store flags
+                      (has_type (ty_64 _) (extractlane value (u8_from_uimm8 0)))
+                      address
+                      offset))
+  (side_effect
+    (x64_movsd_store (to_amode flags address offset) value)))
+
 ;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; Add mem, reg
diff --git a/cranelift/filetests/filetests/isa/x64/extractlane.clif b/cranelift/filetests/filetests/isa/x64/extractlane.clif
index 1cbdfbf7d7..bf6ce4cc97 100644
--- a/cranelift/filetests/filetests/isa/x64/extractlane.clif
+++ b/cranelift/filetests/filetests/isa/x64/extractlane.clif
@@ -151,3 +151,107 @@ block0(v0: f64x2):
 ;   popq %rbp
 ;   retq
 
+function %extract_i32x4_lane0_to_memory(i32x4, i64) {
+block0(v0: i32x4, v1: i64):
+    v2 = extractlane v0, 0
+    store v2, v1
+    return
+}
+
+; VCode:
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block0:
+;   movss %xmm0, 0(%rdi)
+;   movq %rbp, %rsp
+;   popq %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movss %xmm0, (%rdi) ; trap: heap_oob
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %extract_f32x4_lane0_to_memory(f32x4, i64) {
+block0(v0: f32x4, v1: i64):
+    v2 = extractlane v0, 0
+    store v2, v1
+    return
+}
+
+; VCode:
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block0:
+;   movss %xmm0, 0(%rdi)
+;   movq %rbp, %rsp
+;   popq %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movss %xmm0, (%rdi) ; trap: heap_oob
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %extract_i64x2_lane0_to_memory(i64x2, i64) {
+block0(v0: i64x2, v1: i64):
+    v2 = extractlane v0, 0
+    store v2, v1
+    return
+}
+
+; VCode:
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block0:
+;   movsd %xmm0, 0(%rdi)
+;   movq %rbp, %rsp
+;   popq %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movsd %xmm0, (%rdi) ; trap: heap_oob
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %extract_f64x2_lane0_to_memory(f64x2, i64) {
+block0(v0: f64x2, v1: i64):
+    v2 = extractlane v0, 0
+    store v2, v1
+    return
+}
+
+; VCode:
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block0:
+;   movsd %xmm0, 0(%rdi)
+;   movq %rbp, %rsp
+;   popq %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   movsd %xmm0, (%rdi) ; trap: heap_oob
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
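
As a sanity check of the claim in the new ISLE comment (and not part of the patch itself), here is a small illustrative Rust sketch: it stores lane 0 of an integer vector once through `_mm_store_ss` (which lowers to `movss`) and once by extracting the low 32 bits as an integer, and asserts the written bytes match. The program and its names are made up for illustration and assume an x86_64 host; the same reasoning carries over to `movsd` and 64-bit lanes.

```rust
// Illustrative sketch, not part of the patch: storing lane 0 of an integer
// vector through `movss` writes the same bytes as extracting the lane and
// storing it as an integer, which is what the new lowering rules rely on.
#[cfg(target_arch = "x86_64")]
fn main() {
    use core::arch::x86_64::{_mm_castsi128_ps, _mm_cvtsi128_si32, _mm_set_epi32, _mm_store_ss};
    unsafe {
        // Lane 0 holds 0x11223344 (`_mm_set_epi32` takes lanes high-to-low).
        let v = _mm_set_epi32(4, 3, 2, 0x1122_3344);

        // Store lane 0 via `movss`, bit-casting the integer vector to f32x4.
        let mut via_movss = [0u8; 4];
        _mm_store_ss(via_movss.as_mut_ptr().cast::<f32>(), _mm_castsi128_ps(v));

        // Store lane 0 by extracting the low 32 bits as an integer.
        let via_extract = (_mm_cvtsi128_si32(v) as u32).to_le_bytes();

        assert_eq!(via_movss, via_extract);
        println!("lane 0 bytes match: {via_movss:02x?}");
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```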