diff --git a/build.rs b/build.rs
index 835cc8788e..c2524832e2 100644
--- a/build.rs
+++ b/build.rs
@@ -181,6 +181,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
         },
         "Cranelift" => match (testsuite, testname) {
             ("simd", "simd_i8x16_cmp") => return false,
+            ("simd", "simd_load_extend") => return false,
             ("simd", "simd_store") => return false,
             // Most simd tests are known to fail on aarch64 for now, it's going
             // to be a big chunk of work to implement them all there!
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 60e2480cb0..b6328730a1 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -936,6 +936,21 @@ impl MachInstEmit for Inst {
             &Inst::FpuMove128 { rd, rn } => {
                 sink.put4(enc_vecmov(/* 16b = */ true, rd, rn));
             }
+            &Inst::FpuMoveFromVec { rd, rn, idx, ty } => {
+                let (imm5, shift, mask) = match ty {
+                    F32 => (0b00100, 3, 0b011),
+                    F64 => (0b01000, 4, 0b001),
+                    _ => unimplemented!(),
+                };
+                debug_assert_eq!(idx & mask, idx);
+                let imm5 = imm5 | ((idx as u32) << shift);
+                sink.put4(
+                    0b010_11110000_00000_000001_00000_00000
+                        | (imm5 << 16)
+                        | (machreg_to_vec(rn) << 5)
+                        | machreg_to_vec(rd.to_reg()),
+                );
+            }
             &Inst::FpuRR { fpu_op, rd, rn } => {
                 let top22 = match fpu_op {
                     FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000,
@@ -1142,9 +1157,20 @@ impl MachInstEmit for Inst {
                         | machreg_to_vec(rd.to_reg()),
                 );
             }
-            &Inst::MovFromVec64 { rd, rn } => {
+            &Inst::MovFromVec { rd, rn, idx, ty } => {
+                let (q, imm5, shift, mask) = match ty {
+                    I8 => (0b0, 0b00001, 1, 0b1111),
+                    I16 => (0b0, 0b00010, 2, 0b0111),
+                    I32 => (0b0, 0b00100, 3, 0b0011),
+                    I64 => (0b1, 0b01000, 4, 0b0001),
+                    _ => unreachable!(),
+                };
+                debug_assert_eq!(idx & mask, idx);
+                let imm5 = imm5 | ((idx as u32) << shift);
                 sink.put4(
-                    0b010_01110000_01000_0_0111_1_00000_00000
+                    0b000_01110000_00000_0_0111_1_00000_00000
+                        | (q << 30)
+                        | (imm5 << 16)
                         | (machreg_to_vec(rn) << 5)
                         | machreg_to_gpr(rd.to_reg()),
                 );
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 5898585285..8760766b3a 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1799,9 +1799,41 @@ fn test_aarch64_binemit() {
         "mov v20.d[0], x21",
     ));
     insns.push((
-        Inst::MovFromVec64 {
+        Inst::MovFromVec {
+            rd: writable_xreg(3),
+            rn: vreg(27),
+            idx: 14,
+            ty: I8,
+        },
+        "633F1D0E",
+        "umov w3, v27.b[14]",
+    ));
+    insns.push((
+        Inst::MovFromVec {
+            rd: writable_xreg(24),
+            rn: vreg(5),
+            idx: 3,
+            ty: I16,
+        },
+        "B83C0E0E",
+        "umov w24, v5.h[3]",
+    ));
+    insns.push((
+        Inst::MovFromVec {
+            rd: writable_xreg(12),
+            rn: vreg(17),
+            idx: 1,
+            ty: I32,
+        },
+        "2C3E0C0E",
+        "mov w12, v17.s[1]",
+    ));
+    insns.push((
+        Inst::MovFromVec {
             rd: writable_xreg(21),
             rn: vreg(20),
+            idx: 0,
+            ty: I64,
         },
         "953E084E",
         "mov x21, v20.d[0]",
@@ -2337,6 +2369,28 @@ fn test_aarch64_binemit() {
         "mov v17.16b, v26.16b",
     ));
 
+    insns.push((
+        Inst::FpuMoveFromVec {
+            rd: writable_vreg(1),
+            rn: vreg(30),
+            idx: 2,
+            ty: F32,
+        },
+        "C107145E",
+        "mov s1, v30.s[2]",
+    ));
+
+    insns.push((
+        Inst::FpuMoveFromVec {
+            rd: writable_vreg(23),
+            rn: vreg(11),
+            idx: 0,
+            ty: F64,
+        },
+        "7705085E",
+        "mov d23, v11.d[0]",
+    ));
+
     insns.push((
         Inst::FpuRR {
             fpu_op: FPUOp1::Abs32,
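In the `MovFromVec` encoding above, `imm5` does double duty: the position of its lowest set bit encodes the lane size (0b00001 for I8 up to 0b01000 for I64), and the bits above `shift` carry the lane index, with `mask` asserting the index fits. A minimal standalone sketch of that packing (`enc_mov_from_vec` is an illustrative name, not the emit.rs helper API), checked against the first test vector added in emit_tests.rs:

```rust
/// Sketch of the UMOV/MOV-from-element word built in the MovFromVec arm.
/// Bit layout: 0 Q 0 01110000 imm5 0 0111 1 Rn Rd.
fn enc_mov_from_vec(q: u32, imm5: u32, rn: u32, rd: u32) -> u32 {
    0b000_01110000_00000_0_0111_1_00000_00000 | (q << 30) | (imm5 << 16) | (rn << 5) | rd
}

fn main() {
    // I8 lane 14: imm5 = 0b00001 | (14 << 1); Q = 0 since the destination is w3.
    let word = enc_mov_from_vec(0, 0b00001 | (14 << 1), 27, 3);
    // Matches the expected bytes for `umov w3, v27.b[14]` above.
    assert_eq!(word.to_le_bytes(), [0x63, 0x3F, 0x1D, 0x0E]); // "633F1D0E"
}
```

Q is set only for I64, where the destination widens to an X register.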
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index f5b6ecde25..635aaa99e5 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -543,6 +543,14 @@ pub enum Inst {
         rn: Reg,
     },
 
+    /// Move to scalar from a vector element.
+    FpuMoveFromVec {
+        rd: Writable<Reg>,
+        rn: Reg,
+        idx: u8,
+        ty: Type,
+    },
+
     /// 1-op FPU instruction.
     FpuRR {
         fpu_op: FPUOp1,
@@ -679,10 +687,12 @@ pub enum Inst {
         rn: Reg,
     },
 
-    /// Move to a GPR from a vector register.
-    MovFromVec64 {
+    /// Move to a GPR from a vector element.
+    MovFromVec {
         rd: Writable<Reg>,
         rn: Reg,
+        idx: u8,
+        ty: Type,
     },
 
     /// Vector extend.
@@ -1149,6 +1159,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
+        &Inst::FpuMoveFromVec { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuRR { rd, rn, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
         }
@@ -1229,7 +1243,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
-        &Inst::MovFromVec64 { rd, rn } => {
+        &Inst::MovFromVec { rd, rn, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
         }
@@ -1606,6 +1620,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_def(mapper, rd);
             map_use(mapper, rn);
         }
+        &mut Inst::FpuMoveFromVec {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuRR {
             ref mut rd,
             ref mut rn,
@@ -1774,9 +1796,10 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             map_def(mapper, rd);
             map_use(mapper, rn);
         }
-        &mut Inst::MovFromVec64 {
+        &mut Inst::MovFromVec {
             ref mut rd,
             ref mut rn,
+            ..
         } => {
             map_def(mapper, rd);
             map_use(mapper, rn);
@@ -2354,6 +2377,11 @@ impl ShowWithRRU for Inst {
                 let rn = rn.show_rru(mb_rru);
                 format!("mov {}.16b, {}.16b", rd, rn)
             }
+            &Inst::FpuMoveFromVec { rd, rn, idx, ty } => {
+                let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::from_ty(ty));
+                let rn = show_vreg_element(rn, mb_rru, idx, ty);
+                format!("mov {}, {}", rd, rn)
+            }
             &Inst::FpuRR { fpu_op, rd, rn } => {
                 let (op, sizesrc, sizedest) = match fpu_op {
                     FPUOp1::Abs32 => ("fabs", InstSize::Size32, InstSize::Size32),
@@ -2547,10 +2575,14 @@ impl ShowWithRRU for Inst {
                 let rn = rn.show_rru(mb_rru);
                 format!("mov {}.d[0], {}", rd, rn)
             }
-            &Inst::MovFromVec64 { rd, rn } => {
-                let rd = rd.to_reg().show_rru(mb_rru);
-                let rn = rn.show_rru(mb_rru);
-                format!("mov {}, {}.d[0]", rd, rn)
+            &Inst::MovFromVec { rd, rn, idx, ty } => {
+                let op = match ty {
+                    I32 | I64 => "mov",
+                    _ => "umov",
+                };
+                let rd = show_ireg_sized(rd.to_reg(), mb_rru, InstSize::from_ty(ty));
+                let rn = show_vreg_element(rn, mb_rru, idx, ty);
+                format!("{} {}, {}", op, rd, rn)
             }
             &Inst::VecExtend { t, rd, rn } => {
                 let (op, dest, src) = match t {
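The `show_rru` arm for `MovFromVec` prints `umov` for 8/16-bit lanes but plain `mov` for 32/64-bit ones, since the `mov` disassembly alias of UMOV exists only when the whole W/X destination is written. A sketch of just that mnemonic choice (`mov_from_vec_mnemonic` and its lane-bits parameter are made up for illustration):

```rust
/// Pick the printed mnemonic for a GPR <- vector-element move,
/// mirroring the match on ty in the MovFromVec show_rru arm.
fn mov_from_vec_mnemonic(lane_bits: u8) -> &'static str {
    match lane_bits {
        32 | 64 => "mov", // `mov` is the preferred alias of UMOV here
        8 | 16 => "umov", // no alias exists for narrow lanes
        _ => unreachable!("unsupported lane size"),
    }
}

fn main() {
    assert_eq!(mov_from_vec_mnemonic(8), "umov"); // umov w3, v27.b[14]
    assert_eq!(mov_from_vec_mnemonic(32), "mov"); // mov w12, v17.s[1]
}
```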
diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
index 59841ed82c..9039fc09d4 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/regs.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs
@@ -332,3 +332,22 @@ pub fn show_vreg_vector(reg: Reg, mb_rru: Option<&RealRegUniverse>, ty: Type) ->
 
     s
 }
+
+/// Show an indexed vector element.
+pub fn show_vreg_element(reg: Reg, mb_rru: Option<&RealRegUniverse>, idx: u8, ty: Type) -> String {
+    assert_eq!(RegClass::V128, reg.get_class());
+    let mut s = reg.show_rru(mb_rru);
+
+    let suffix = match ty {
+        I8 => "b",
+        I16 => "h",
+        I32 => "s",
+        I64 => "d",
+        F32 => "s",
+        F64 => "d",
+        _ => unimplemented!(),
+    };
+
+    s.push_str(&format!(".{}[{}]", suffix, idx));
+    s
+}
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index a52b6fba02..a9d5c44268 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -99,9 +99,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             alu_op,
             ty: I64,
         });
-        ctx.emit(Inst::MovFromVec64 {
+        ctx.emit(Inst::MovFromVec {
             rd,
             rn: va.to_reg(),
+            idx: 0,
+            ty: I64,
         });
     }
 
@@ -131,9 +133,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             alu_op,
             ty: I64,
         });
-        ctx.emit(Inst::MovFromVec64 {
+        ctx.emit(Inst::MovFromVec {
             rd,
             rn: va.to_reg(),
+            idx: 0,
+            ty: I64,
         });
     }
 
@@ -1146,7 +1150,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
                 (true, false) => {
                     let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
-                    ctx.emit(Inst::MovFromVec64 { rd, rn });
+                    ctx.emit(Inst::MovFromVec {
+                        rd,
+                        rn,
+                        idx: 0,
+                        ty: I64,
+                    });
                 }
             }
         }
@@ -1451,6 +1460,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             ctx.emit(Inst::gen_move(rd, rm, ty));
         }
 
+        Opcode::Extractlane => {
+            if let InstructionData::BinaryImm8 { imm, .. } = ctx.data(insn) {
+                let idx = *imm;
+                let rd = output_to_reg(ctx, outputs[0]);
+                let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
+                let ty = ty.unwrap();
+
+                if ty_is_int(ty) {
+                    ctx.emit(Inst::MovFromVec { rd, rn, idx, ty });
+                // Plain moves are faster on some processors.
+                } else if idx == 0 {
+                    ctx.emit(Inst::gen_move(rd, rn, ty));
+                } else {
+                    ctx.emit(Inst::FpuMoveFromVec { rd, rn, idx, ty });
+                }
+            } else {
+                unreachable!();
+            }
+        }
+
         Opcode::Shuffle
         | Opcode::Vsplit
         | Opcode::Vconcat
@@ -1459,7 +1488,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::VallTrue
        | Opcode::Splat
         | Opcode::Insertlane
-        | Opcode::Extractlane
         | Opcode::ScalarToVector
         | Opcode::Swizzle
         | Opcode::Uload8x8Complex
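The new `Opcode::Extractlane` arm reduces to a three-way policy: integer lanes always go through `MovFromVec`, a float extract of lane 0 becomes a plain register move (faster on some processors, per the comment), and any other float lane uses `FpuMoveFromVec`. A self-contained sketch of that decision tree, where the `Extract` enum and `lower_extractlane` are illustrative stand-ins rather than cranelift types:

```rust
/// Stand-in for the three instructions the lowering can emit.
#[derive(Debug, PartialEq)]
enum Extract {
    MovFromVec { idx: u8 },     // integer lane -> GPR (umov/mov)
    FpuMove,                    // float lane 0 -> plain vector-register move
    FpuMoveFromVec { idx: u8 }, // float lane n > 0 -> mov Sd/Dd, Vn.T[n]
}

fn lower_extractlane(lane_is_int: bool, idx: u8) -> Extract {
    if lane_is_int {
        Extract::MovFromVec { idx }
    } else if idx == 0 {
        // Plain moves are faster on some processors.
        Extract::FpuMove
    } else {
        Extract::FpuMoveFromVec { idx }
    }
}

fn main() {
    assert_eq!(lower_extractlane(true, 3), Extract::MovFromVec { idx: 3 });
    assert_eq!(lower_extractlane(false, 0), Extract::FpuMove);
    assert_eq!(lower_extractlane(false, 1), Extract::FpuMoveFromVec { idx: 1 });
}
```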