CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions.
This patch implements, for aarch64, the following wasm SIMD extension:

  v128.load32_zero and v128.load64_zero instructions
  https://github.com/WebAssembly/simd/pull/237

The changes are straightforward:

* no new CLIF instructions.  They are translated into an existing CLIF
  scalar load followed by a CLIF `scalar_to_vector`.

* the comment/specification for CLIF `scalar_to_vector` has been changed
  to match the actual intended semantics, per consultation with Andrew
  Brown.

* translation from `scalar_to_vector` to the aarch64 `fmov` instruction.
  This has been generalised slightly so as to allow both 32- and 64-bit
  transfers.

* special-case zero in `lower_constant_f128` in order to avoid a
  potentially slow call to `Inst::load_fp_constant128`.

* once "Allow loads to merge into other operations during instruction
  selection in MachInst backends"
  (https://github.com/bytecodealliance/wasmtime/issues/2340) lands, we
  can use that functionality to pattern-match the two-CLIF pair and emit
  a single aarch64 instruction.

* a simple filetest has been added.

There is no comprehensive testcase in this commit, because that lives in
a separate repo; the implementation has nevertheless been tested.
Committed by julian-seward1 (parent 285edeec3e, commit dd9bfcefaa)
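As a quick illustration of the first bullet in the message, a hedged CLIF sketch (value numbers and flags are illustrative, not taken from the commit) of what `v128.load32_zero` turns into — an ordinary scalar load feeding `scalar_to_vector`:

    v1 = load.i32 v0                 ; 32-bit scalar load from the heap address in v0
    v2 = scalar_to_vector.i32x4 v1   ; v1 lands in lane 0; lanes 1..3 are zero

`v128.load64_zero` is the same shape with `load.i64` and `scalar_to_vector.i64x2`.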
@@ -3798,12 +3798,9 @@ pub(crate) fn define(
     Inst::new(
         "scalar_to_vector",
         r#"
-        Scalar To Vector -- move a value out of a scalar register and into a vector register; the
-        scalar will be moved to the lowest-order bits of the vector register. Note that this
-        instruction is intended as a low-level legalization instruction and frontends should prefer
-        insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
-        types (e.g. integers) but not for others (e.g. floats).
+        Copies a scalar value to a vector value.  The scalar is copied into the
+        least significant lane of the vector, and all other lanes will be zero.
         "#,
         &formats.unary,
     )
     .operands_in(vec![s])
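To make the revised semantics concrete, a small hedged CLIF example (values chosen for illustration):

    v0 = iconst.i32 0x11223344
    v1 = scalar_to_vector.i32x4 v0
    ; v1 is (0x11223344, 0, 0, 0): lane 0 holds the scalar, all other lanes are zero

Unlike the old wording, the zeroing of the upper lanes is now guaranteed for all types, which is exactly what the `v128.load{32,64}_zero` translation relies on.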
@@ -579,6 +579,15 @@ impl ScalarSize {
         }
     }
 
+    /// Convert to an integer operand size.
+    pub fn operand_size(&self) -> OperandSize {
+        match self {
+            ScalarSize::Size32 => OperandSize::Size32,
+            ScalarSize::Size64 => OperandSize::Size64,
+            _ => panic!("Unexpected operand_size request for: {:?}", self),
+        }
+    }
+
     /// Convert from a type into the smallest size that fits.
     pub fn from_ty(ty: Type) -> ScalarSize {
         Self::from_bits(ty_bits(ty))
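A self-contained sketch of the mapping the new helper encodes; the enums here are simplified stand-ins for the backend's real `ScalarSize`/`OperandSize` types (assumption: only the variants relevant to this patch are shown):

    #[derive(Debug, Clone, Copy, PartialEq)]
    enum ScalarSize { Size32, Size64, Size128 }

    #[derive(Debug, Clone, Copy, PartialEq)]
    enum OperandSize { Size32, Size64 }

    impl ScalarSize {
        // Scalar sizes with a GPR counterpart map across; anything else
        // (e.g. 128 bits) has no single-GPR operand size, hence the panic.
        fn operand_size(&self) -> OperandSize {
            match self {
                ScalarSize::Size32 => OperandSize::Size32,
                ScalarSize::Size64 => OperandSize::Size64,
                _ => panic!("Unexpected operand_size request for: {:?}", self),
            }
        }
    }

    fn main() {
        assert_eq!(ScalarSize::Size64.operand_size(), OperandSize::Size64);
    }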
@@ -1651,12 +1651,13 @@ impl MachInstEmit for Inst {
             };
             sink.put4(enc_fround(top22, rd, rn));
         }
-        &Inst::MovToFpu { rd, rn } => {
-            sink.put4(
-                0b100_11110_01_1_00_111_000000_00000_00000
-                    | (machreg_to_gpr(rn) << 5)
-                    | machreg_to_vec(rd.to_reg()),
-            );
+        &Inst::MovToFpu { rd, rn, size } => {
+            let template = match size {
+                ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
+                ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
+                _ => unreachable!(),
+            };
+            sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
         }
         &Inst::MovToVec { rd, rn, idx, size } => {
             let (imm5, shift) = match size.lane_size() {
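The two templates differ only in the `sf` bit (bit 31) and the `type` field (bits 23:22) of the AArch64 "FMOV (general)" encoding. A free-standing sanity check — register numbers borrowed from the new emission test below, not backend code — reproduces the expected bytes:

    fn main() {
        let template32: u32 = 0b000_11110_00_1_00_111_000000_00000_00000;
        let (rn, rd) = (28u32, 1u32); // w28 -> s1, as in the new test case
        let insn = template32 | (rn << 5) | rd;
        // "8103271E" in the test is the little-endian byte dump of this word.
        assert_eq!(insn.to_le_bytes(), [0x81, 0x03, 0x27, 0x1E]);
    }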
@@ -1860,10 +1860,20 @@ fn test_aarch64_binemit() {
         Inst::MovToFpu {
             rd: writable_vreg(31),
             rn: xreg(0),
+            size: ScalarSize::Size64,
         },
         "1F00679E",
         "fmov d31, x0",
     ));
+    insns.push((
+        Inst::MovToFpu {
+            rd: writable_vreg(1),
+            rn: xreg(28),
+            size: ScalarSize::Size32,
+        },
+        "8103271E",
+        "fmov s1, w28",
+    ));
     insns.push((
         Inst::MovToVec {
             rd: writable_vreg(0),
@@ -877,10 +877,13 @@ pub enum Inst {
         rn: Reg,
     },
 
-    /// Move from a GPR to a scalar FP register.
+    /// Move from a GPR to a vector register.  The scalar value is parked in the lowest lane
+    /// of the destination, and all other lanes are zeroed out.  Currently only 32- and 64-bit
+    /// transfers are supported.
     MovToFpu {
         rd: Writable<Reg>,
         rn: Reg,
+        size: ScalarSize,
     },
 
     /// Move to a vector element from a GPR.
@@ -1319,13 +1322,15 @@ impl Inst {
                 size: VectorSize::Size8x8
             }]
         } else {
-            // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
+            // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
+            // bits.
             let tmp = alloc_tmp(RegClass::I64, I32);
             let mut insts = Inst::load_constant(tmp, value as u64);
 
             insts.push(Inst::MovToFpu {
                 rd,
                 rn: tmp.to_reg(),
+                size: ScalarSize::Size64,
             });
 
             insts
@@ -1340,9 +1345,9 @@ impl Inst {
     ) -> SmallVec<[Inst; 4]> {
         if let Ok(const_data) = u32::try_from(const_data) {
             Inst::load_fp_constant32(rd, const_data, alloc_tmp)
-        // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent bits.
-        // Also, treat it as half of a 128-bit vector and consider replicated patterns. Scalar MOVI
-        // might also be an option.
+        // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
+        // bits. Also, treat it as half of a 128-bit vector and consider replicated
+        // patterns. Scalar MOVI might also be an option.
         } else if const_data & (u32::MAX as u64) == 0 {
             let tmp = alloc_tmp(RegClass::I64, I64);
             let mut insts = Inst::load_constant(tmp, const_data);
@@ -1350,6 +1355,7 @@ impl Inst {
             insts.push(Inst::MovToFpu {
                 rd,
                 rn: tmp.to_reg(),
+                size: ScalarSize::Size64,
             });
 
             insts
@@ -1849,7 +1855,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
-        &Inst::MovToFpu { rd, rn } => {
+        &Inst::MovToFpu { rd, rn, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
         }
@@ -2527,6 +2533,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
         &mut Inst::MovToFpu {
             ref mut rd,
             ref mut rn,
+            ..
         } => {
             map_def(mapper, rd);
             map_use(mapper, rn);
@@ -3406,9 +3413,10 @@ impl Inst {
                 let rn = show_vreg_scalar(rn, mb_rru, size);
                 format!("{} {}, {}", inst, rd, rn)
             }
-            &Inst::MovToFpu { rd, rn } => {
-                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
-                let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size64);
+            &Inst::MovToFpu { rd, rn, size } => {
+                let operand_size = size.operand_size();
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
+                let rn = show_ireg_sized(rn, mb_rru, operand_size);
                 format!("fmov {}, {}", rd, rn)
             }
             &Inst::MovToVec { rd, rn, idx, size } => {
@@ -837,10 +837,20 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
     rd: Writable<Reg>,
     value: u128,
 ) {
-    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
-
-    for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
-        ctx.emit(inst);
+    if value == 0 {
+        // Fast-track a common case.  The general case, viz, calling
+        // `Inst::load_fp_constant128`, is potentially expensive.
+        ctx.emit(Inst::VecDupImm {
+            rd,
+            imm: ASIMDMovModImm::zero(),
+            invert: false,
+            size: VectorSize::Size8x16,
+        });
+    } else {
+        let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+        for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
+            ctx.emit(inst);
+        }
     }
 }
 
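The payoff of the fast path: on AArch64 a zero vector is a single `movi`, whereas `load_fp_constant128` goes through a literal load. Roughly (register choice illustrative, and the general-path shape is an assumption about how the constant gets materialised):

    movi v0.16b, #0    ; fast path: one instruction, no memory access
    ldr  q0, <literal> ; general path: PC-relative load of the 128-bit constant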
@@ -179,8 +179,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let vb = ctx.alloc_tmp(RegClass::V128, I128);
             let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
             let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
-            ctx.emit(Inst::MovToFpu { rd: va, rn: ra });
-            ctx.emit(Inst::MovToFpu { rd: vb, rn: rb });
+            ctx.emit(Inst::MovToFpu {
+                rd: va,
+                rn: ra,
+                size: ScalarSize::Size64,
+            });
+            ctx.emit(Inst::MovToFpu {
+                rd: vb,
+                rn: rb,
+                size: ScalarSize::Size64,
+            });
             ctx.emit(Inst::FpuRRR {
                 fpu_op,
                 rd: va,
@@ -1703,7 +1711,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
                 (false, true) => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
-                    ctx.emit(Inst::MovToFpu { rd, rn });
+                    ctx.emit(Inst::MovToFpu {
+                        rd,
+                        rn,
+                        size: ScalarSize::Size64,
+                    });
                 }
                 (true, false) => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2056,6 +2068,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::ScalarToVector => {
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let rd = get_output_reg(ctx, outputs[0]);
+            let input_ty = ctx.input_ty(insn, 0);
+            if (input_ty == I32 && ty.unwrap() == I32X4)
+                || (input_ty == I64 && ty.unwrap() == I64X2)
+            {
+                ctx.emit(Inst::MovToFpu {
+                    rd,
+                    rn,
+                    size: ScalarSize::from_ty(input_ty),
+                });
+            } else {
+                return Err(CodegenError::Unsupported(format!(
+                    "ScalarToVector: unsupported types {:?} -> {:?}",
+                    input_ty, ty
+                )));
+            }
+        }
+
         Opcode::VanyTrue | Opcode::VallTrue => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
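For the two accepted type pairs, `ScalarSize::from_ty` picks the 32- or 64-bit `fmov`, so the lowering should come out as (registers illustrative):

    fmov s0, w0    ; i32 -> i32x4: writes lane 0, zeroes the upper 96 bits of v0
    fmov d0, x0    ; i64 -> i64x2: writes lane 0, zeroes the upper 64 bits of v0

Zeroing the rest of the destination is an architectural guarantee of the `fmov` (general) form, which is what makes it a correct single-instruction `scalar_to_vector`.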
@@ -2341,7 +2373,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Vsplit
         | Opcode::Vconcat
-        | Opcode::ScalarToVector
         | Opcode::Uload8x8Complex
         | Opcode::Sload8x8Complex
         | Opcode::Uload16x4Complex
@@ -0,0 +1,33 @@
+test compile
+target aarch64
+
+function %f1() -> i64x2 {
+block0:
+  v0 = iconst.i64 281474976710657
+  v1 = scalar_to_vector.i64x2 v0
+  return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #1
+; nextln: movk x0, #1, LSL #48
+; nextln: fmov d0, x0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f2() -> i32x4 {
+block0:
+  v0 = iconst.i32 42679
+  v1 = scalar_to_vector.i32x4 v0
+  return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #42679
+; nextln: fmov s0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
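Assuming the usual Cranelift workflow, a `test compile` file like this runs under the filetests harness (e.g. `clif-util test <file>`, or the `cranelift-filetests` crate via `cargo test`); each `check:`/`nextln:` directive matches one line of the generated VCode. The file's path is not shown in this view.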
@@ -1426,6 +1426,18 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
             state.push1(dfg.first_result(load))
         }
+        Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
+            translate_load(
+                memarg,
+                ir::Opcode::Load,
+                type_of(op).lane_type(),
+                builder,
+                state,
+                environ,
+            )?;
+            let as_vector = builder.ins().scalar_to_vector(type_of(op), state.pop1());
+            state.push1(as_vector)
+        }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
             let extracted = builder.ins().extractlane(vector, lane.clone());
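Seen from the wasm side, a hedged sketch of what this new arm handles; `type_of(op)` is I32X4 for `v128.load32_zero`, so `lane_type()` selects a 32-bit scalar load ahead of the `scalar_to_vector`:

    (v128.load32_zero (i32.const 16))  ;; loads 4 bytes at address 16, zeroing lanes 1..3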
@@ -1790,10 +1802,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
             return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
         }
-
-        Operator::V128Load32Zero { .. } | Operator::V128Load64Zero { .. } => {
-            return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
-        }
     };
     Ok(())
 }
@@ -2516,7 +2524,8 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::I32x4MaxU
         | Operator::F32x4ConvertI32x4S
         | Operator::F32x4ConvertI32x4U
-        | Operator::I32x4Bitmask => I32X4,
+        | Operator::I32x4Bitmask
+        | Operator::V128Load32Zero { .. } => I32X4,
 
         Operator::I64x2Splat
         | Operator::V128Load64Splat { .. }
@@ -2528,7 +2537,8 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::I64x2ShrU
         | Operator::I64x2Add
         | Operator::I64x2Sub
-        | Operator::I64x2Mul => I64X2,
+        | Operator::I64x2Mul
+        | Operator::V128Load64Zero { .. } => I64X2,
 
         Operator::F32x4Splat
         | Operator::F32x4ExtractLane { .. }