diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index de78c3b3b7..681b3104d5 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -396,6 +396,7 @@ fn define_simd(
     let insertlane = insts.by_name("insertlane");
     let ishl = insts.by_name("ishl");
     let ishl_imm = insts.by_name("ishl_imm");
+    let load_splat = insts.by_name("load_splat");
     let raw_bitcast = insts.by_name("raw_bitcast");
     let scalar_to_vector = insts.by_name("scalar_to_vector");
     let splat = insts.by_name("splat");
@@ -820,6 +821,7 @@ fn define_simd(
     narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
     narrow.custom_legalize(fmin, "expand_minmax_vector");
     narrow.custom_legalize(fmax, "expand_minmax_vector");
+    narrow.custom_legalize(load_splat, "expand_load_splat");
 
     narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
     narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 2c16734f27..9cb77493c7 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -4409,5 +4409,24 @@ pub(crate) fn define(
         .other_side_effects(true),
     );
 
+    let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address");
+    let a = &Operand::new("a", TxN);
+
+    ig.push(
+        Inst::new(
+            "load_splat",
+            r#"
+        Load an element from memory at ``p + Offset`` and return a vector
+        whose lanes are all set to that element.
+
+        This is equivalent to ``load`` followed by ``splat``.
+        "#,
+            &formats.load,
+        )
+        .operands_in(vec![MemFlags, p, Offset])
+        .operands_out(vec![a])
+        .can_load(true),
+    );
+
     ig.build()
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index f85c1028ff..95bf4bb63f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -680,4 +680,19 @@ impl VectorSize {
             _ => *self,
         }
     }
+
+    /// Return the encoding bits that are used by some SIMD instructions
+    /// for a particular operand size.
+    pub fn enc_size(&self) -> (u32, u32) {
+        let q = self.is_128bits() as u32;
+        let size = match self.lane_size() {
+            ScalarSize::Size8 => 0b00,
+            ScalarSize::Size16 => 0b01,
+            ScalarSize::Size32 => 0b10,
+            ScalarSize::Size64 => 0b11,
+            _ => unreachable!(),
+        };
+
+        (q, size)
+    }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index d422fdc24f..124fd36c87 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
     (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
 }
 
+fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(size & 0b11, size);
+    0b0_0_0011010_10_00000_110_0_00_00000_00000
+        | q << 30
+        | size << 10
+        | machreg_to_gpr(rn) << 5
+        | machreg_to_vec(rt.to_reg())
+}
+
 fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
 }
@@ -1380,14 +1390,7 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
             &Inst::VecMisc { op, rd, rn, size } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
-                let q = if size.is_128bits() { 1 } else { 0 };
+                let (q, enc_size) = size.enc_size();
                 let (u, bits_12_16, size) = match op {
                     VecMisc2::Not => (0b1, 0b00101, 0b00),
                     VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1756,13 +1759,7 @@ impl MachInstEmit for Inst {
                 alu_op,
                 size,
             } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
+                let (q, enc_size) = size.enc_size();
                 let is_float = match alu_op {
                     VecALUOp::Fcmeq
                     | VecALUOp::Fcmgt
@@ -1776,6 +1773,7 @@ impl MachInstEmit for Inst {
                     _ => false,
                 };
                 let enc_float_size = match (is_float, size) {
+                    (true, VectorSize::Size32x2) => 0b0,
                     (true, VectorSize::Size32x4) => 0b0,
                     (true, VectorSize::Size64x2) => 0b1,
                     (true, _) => unimplemented!(),
@@ -1783,58 +1781,73 @@ impl MachInstEmit for Inst {
                 };
 
                 let (top11, bit15_10) = match alu_op {
-                    VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
-                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
-                    VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
-                    VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
+                    VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
+                    VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
+                    VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
+                    VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
                     // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
-                    VecALUOp::And => (0b010_01110_00_1, 0b000111),
-                    VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
-                    VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
-                    VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
-                    VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
-                    VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
-                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
-                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::And => (0b000_01110_00_1, 0b000111),
+                    VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
+                    VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
+                    VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
+                    VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
+                    VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
+                    VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
                     VecALUOp::Mul => {
                         debug_assert_ne!(size, VectorSize::Size64x2);
-                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
+                        (0b000_01110_00_1 | enc_size << 1, 0b100111)
                     }
-                    VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
-                    VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
-                    VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
-                    VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
-                    VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
-                    VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
-                    VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
-                    VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
+                    VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
+                    VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
+                    VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
+                    VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
+                    VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
+                    VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
+                    VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
+                    VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
                     VecALUOp::Umlal => {
                         debug_assert!(!size.is_128bits());
                         (0b001_01110_00_1 | enc_size << 1, 0b100000)
                     }
                 };
                 let top11 = if is_float {
-                    top11 | enc_float_size << 1
+                    top11 | (q << 9) | enc_float_size << 1
                 } else {
-                    top11
+                    top11 | (q << 9)
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
+            &Inst::VecLoadReplicate {
+                rd,
+                rn,
+                size,
+                srcloc,
+            } => {
+                let (q, size) = size.enc_size();
+
+                if let Some(srcloc) = srcloc {
+                    // Register the offset at which the actual load instruction starts.
+                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                }
+
+                sink.put4(enc_ldst_vec(q, size, rn, rd));
+            }
             &Inst::MovToNZCV { rn } => {
                 sink.put4(0xd51b4200 | machreg_to_gpr(rn));
             }
@@ -2119,9 +2132,12 @@ impl MachInstEmit for Inst {
                     inst.emit(sink, emit_info, state);
                 }
 
-                let (reg, offset) = match mem {
-                    AMode::Unscaled(r, simm9) => (r, simm9.value()),
-                    AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+                let (reg, index_reg, offset) = match mem {
+                    AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+                    AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+                    AMode::UnsignedOffset(r, uimm12scaled) => {
+                        (r, None, uimm12scaled.value() as i32)
+                    }
                     _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
                 };
                 let abs_offset = if offset < 0 {
@@ -2135,9 +2151,22 @@ impl MachInstEmit for Inst {
                     ALUOp::Add64
                 };
 
-                if offset == 0 {
-                    let mov = Inst::mov(rd, reg);
-                    mov.emit(sink, emit_info, state);
+                if let Some((idx, extendop)) = index_reg {
+                    let add = Inst::AluRRRExtend {
+                        alu_op: ALUOp::Add64,
+                        rd,
+                        rn: reg,
+                        rm: idx,
+                        extendop,
+                    };
+
+                    add.emit(sink, emit_info, state);
+                } else if offset == 0 {
+                    if reg != rd.to_reg() {
+                        let mov = Inst::mov(rd, reg);
+
+                        mov.emit(sink, emit_info, state);
+                    }
                 } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
                     let add = Inst::AluRRImm12 {
                         alu_op,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 6d981c2eaa..48707610ff 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(28),
             rn: vreg(12),
             rm: vreg(4),
-            size: VectorSize::Size32x4,
+            size: VectorSize::Size32x2,
         },
-        "9CE5244E",
-        "fcmeq v28.4s, v12.4s, v4.4s",
+        "9CE5240E",
+        "fcmeq v28.2s, v12.2s, v4.2s",
     ));
 
     insns.push((
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(6),
             rn: vreg(9),
             rm: vreg(8),
-            size: VectorSize::Size8x16,
+            size: VectorSize::Size8x8,
         },
-        "2665286E",
-        "umax v6.16b, v9.16b, v8.16b",
+        "2665282E",
+        "umax v6.8b, v9.8b, v8.8b",
     ));
 
     insns.push((
@@ -3507,6 +3507,28 @@ fn test_aarch64_binemit() {
         "tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
     ));
 
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(31),
+            rn: xreg(0),
+            srcloc: None,
+            size: VectorSize::Size64x2,
+        },
+        "1FCC404D",
+        "ld1r { v31.2d }, [x0]",
+    ));
+
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(0),
+            rn: xreg(25),
+            srcloc: None,
+            size: VectorSize::Size8x8,
+        },
+        "20C3400D",
+        "ld1r { v0.8b }, [x25]",
+    ));
+
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 544d04c23c..e9c0f15425 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -975,6 +975,14 @@ pub enum Inst {
         is_extension: bool,
     },
 
+    /// Load an element and replicate to all lanes of a vector.
+    VecLoadReplicate {
+        rd: Writable<Reg>,
+        rn: Reg,
+        size: VectorSize,
+        srcloc: Option<SourceLoc>,
+    },
+
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
     MovToNZCV {
         rn: Reg,
     },
@@ -1609,7 +1617,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 collector.add_def(rd);
             }
         }
-
+        &Inst::VecLoadReplicate { rd, rn, .. } => {
            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
             collector.add_use(rn);
             collector.add_use(rm);
         }
@@ -1762,8 +1773,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
         &Inst::LoadExtName { rd, .. } => {
             collector.add_def(rd);
         }
-        &Inst::LoadAddr { rd, mem: _ } => {
+        &Inst::LoadAddr { rd, ref mem } => {
             collector.add_def(rd);
+            memarg_regs(mem, collector);
         }
         &Inst::VirtualSPOffsetAdj { .. } => {}
         &Inst::EmitIsland { .. } => {}
@@ -2189,6 +2201,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                 map_def(mapper, rd);
             }
         }
+        &mut Inst::VecLoadReplicate {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuCmp32 {
             ref mut rn,
             ref mut rm,
@@ -3412,6 +3432,12 @@ impl Inst {
                 let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
                 format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
             }
+            &Inst::VecLoadReplicate { rd, rn, size, .. } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = rn.show_rru(mb_rru);
+
+                format!("ld1r {{ {} }}, [{}]", rd, rn)
+            }
             &Inst::MovToNZCV { rn } => {
                 let rn = rn.show_rru(mb_rru);
                 format!("msr nzcv, {}", rn)
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index fc28cb3581..ecdcb9c6d1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::LoadSplat => {
+            let off = ctx.data(insn).load_store_offset().unwrap();
+            let ty = ty.unwrap();
+            let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
+            let memflags = ctx.memflags(insn).expect("memory flags");
+            let rd = get_output_reg(ctx, outputs[0]);
+            let size = VectorSize::from_ty(ty);
+            let srcloc = if memflags.notrap() {
+                None
+            } else {
+                Some(ctx.srcloc(insn))
+            };
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+
+            ctx.emit(Inst::LoadAddr { rd: tmp, mem });
+            ctx.emit(Inst::VecLoadReplicate {
+                rd,
+                rn: tmp.to_reg(),
+                size,
+                srcloc,
+            });
+        }
+
         Opcode::Store
         | Opcode::Istore8
         | Opcode::Istore16
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 72890cffd9..976f1581e3 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -1892,3 +1892,31 @@ fn expand_tls_value(
         unreachable!();
     }
 }
+
+fn expand_load_splat(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+
+    pos.use_srcloc(inst);
+
+    let (ptr, offset, flags) = match pos.func.dfg[inst] {
+        ir::InstructionData::Load {
+            opcode: ir::Opcode::LoadSplat,
+            arg,
+            offset,
+            flags,
+        } => (arg, offset, flags),
+        _ => panic!(
+            "Expected load_splat: {}",
+            pos.func.dfg.display_inst(inst, None)
+        ),
+    };
+    let ty = pos.func.dfg.ctrl_typevar(inst);
+    let load = pos.ins().load(ty.lane_type(), flags, ptr, offset);
+
+    pos.func.dfg.replace(inst).splat(ty, load);
+}
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 7c827802ba..ef1804cf12 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1380,19 +1380,17 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::V128Load16Splat { memarg }
        | Operator::V128Load32Splat { memarg }
        | Operator::V128Load64Splat { memarg } => {
-            // TODO: For spec compliance, this is initially implemented as a combination of `load +
-            // splat` but could be implemented eventually as a single instruction (`load_splat`).
-            // See https://github.com/bytecodealliance/wasmtime/issues/1175.
-            translate_load(
+            let opcode = ir::Opcode::LoadSplat;
+            let result_ty = type_of(op);
+            let (flags, base, offset) = prepare_load(
                 memarg,
-                ir::Opcode::Load,
-                type_of(op).lane_type(),
+                mem_op_size(opcode, result_ty.lane_type()),
                 builder,
                 state,
                 environ,
             )?;
-            let splatted = builder.ins().splat(type_of(op), state.pop1());
-            state.push1(splatted)
+            let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
+            state.push1(dfg.first_result(load))
         }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
@@ -2040,7 +2038,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
         ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
         ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
         ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
-        ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
+        ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
         _ => panic!("unknown size of mem op for {:?}", opcode),
     }
 }
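
A rough CLIF-level sketch of the new instruction, for illustration only (hand-written, not part of the patch; the textual form assumes the standard load format, so memory flags and an immediate offset can be written the same way as for `load`):

    function %load32_splat(i64) -> i32x4 {
    block0(v0: i64):
        ;; Load a single i32 from [v0] and broadcast it to every lane of the result.
        v1 = load_splat.i32x4 v0
        return v1
    }

On AArch64 this is selected as the new `VecLoadReplicate` instruction (an `ld1r { vN.4s }, [xN]` once the address is in a register), while on x86 `expand_load_splat` legalizes it back into a plain `load` followed by `splat`.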