CL/aarch64: implement the wasm SIMD pseudo-max/min and FP-rounding instructions
This patch implements, for aarch64, the following wasm SIMD extensions:

* Floating-point rounding instructions: https://github.com/WebAssembly/simd/pull/232
* Pseudo-Minimum and Pseudo-Maximum instructions: https://github.com/WebAssembly/simd/pull/122

The changes are straightforward:

* `build.rs`: the relevant tests have been enabled.
* `cranelift/codegen/meta/src/shared/instructions.rs`: new CLIF instructions `fmin_pseudo` and `fmax_pseudo`. The wasm rounding instructions do not need any new CLIF instructions.
* `cranelift/wasm/src/code_translator.rs`: translation into CLIF; this is pretty much the same as any other unary or binary vector instruction (for the rounding and the pmin/max respectively).
* `cranelift/codegen/src/isa/aarch64/lower_inst.rs`:
  - `fmin_pseudo` and `fmax_pseudo` are converted into a two-instruction sequence, `fcmgt` followed by `bsl`.
  - the CLIF rounding instructions are converted to a suitable vector `frint{n,z,p,m}` instruction.
* `cranelift/codegen/src/isa/aarch64/inst/mod.rs`: minor extension of `pub enum VecMisc2` to handle the rounding operations, and the corresponding `emit` cases.
This commit is contained in:
committed by
julian-seward1
parent
fc1cedb2ff
commit
c15d9bd61b
@@ -1430,6 +1430,22 @@ impl MachInstEmit for Inst {
|
||||
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
|
||||
(0b1, 0b11101, enc_size & 0b1)
|
||||
}
|
||||
VecMisc2::Frintn => {
|
||||
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
|
||||
(0b0, 0b11000, enc_size & 0b01)
|
||||
}
|
||||
VecMisc2::Frintz => {
|
||||
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
|
||||
(0b0, 0b11001, enc_size | 0b10)
|
||||
}
|
||||
VecMisc2::Frintm => {
|
||||
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
|
||||
(0b0, 0b11001, enc_size & 0b01)
|
||||
}
|
||||
VecMisc2::Frintp => {
|
||||
debug_assert!(size == VectorSize::Size32x4 || size == VectorSize::Size64x2);
|
||||
(0b0, 0b11000, enc_size | 0b10)
|
||||
}
|
||||
};
|
||||
sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
|
||||
}
|
||||
|
||||
@@ -3476,6 +3476,94 @@ fn test_aarch64_binemit() {
|
||||
"ucvtf v10.2d, v19.2d",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintn,
|
||||
rd: writable_vreg(11),
|
||||
rn: vreg(18),
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"4B8A214E",
|
||||
"frintn v11.4s, v18.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintn,
|
||||
rd: writable_vreg(12),
|
||||
rn: vreg(17),
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"2C8A614E",
|
||||
"frintn v12.2d, v17.2d",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintz,
|
||||
rd: writable_vreg(11),
|
||||
rn: vreg(18),
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"4B9AA14E",
|
||||
"frintz v11.4s, v18.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintz,
|
||||
rd: writable_vreg(12),
|
||||
rn: vreg(17),
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"2C9AE14E",
|
||||
"frintz v12.2d, v17.2d",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintm,
|
||||
rd: writable_vreg(11),
|
||||
rn: vreg(18),
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"4B9A214E",
|
||||
"frintm v11.4s, v18.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintm,
|
||||
rd: writable_vreg(12),
|
||||
rn: vreg(17),
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"2C9A614E",
|
||||
"frintm v12.2d, v17.2d",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintp,
|
||||
rd: writable_vreg(11),
|
||||
rn: vreg(18),
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"4B8AA14E",
|
||||
"frintp v11.4s, v18.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Frintp,
|
||||
rd: writable_vreg(12),
|
||||
rn: vreg(17),
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"2C8AE14E",
|
||||
"frintp v12.2d, v17.2d",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLanes {
|
||||
op: VecLanesOp::Uminv,
|
||||
|
||||
@@ -319,6 +319,14 @@ pub enum VecMisc2 {
|
||||
Scvtf,
|
||||
/// Unsigned integer convert to floating-point
|
||||
Ucvtf,
|
||||
/// Floating point round to integral, rounding towards nearest
|
||||
Frintn,
|
||||
/// Floating point round to integral, rounding towards zero
|
||||
Frintz,
|
||||
/// Floating point round to integral, rounding towards minus infinity
|
||||
Frintm,
|
||||
/// Floating point round to integral, rounding towards plus infinity
|
||||
Frintp,
|
||||
}
|
||||
|
||||
/// A Vector narrowing operation with two registers.
|
||||
@@ -3436,6 +3444,10 @@ impl Inst {
|
||||
VecMisc2::Fcvtzu => ("fcvtzu", size),
|
||||
VecMisc2::Scvtf => ("scvtf", size),
|
||||
VecMisc2::Ucvtf => ("ucvtf", size),
|
||||
VecMisc2::Frintn => ("frintn", size),
|
||||
VecMisc2::Frintz => ("frintz", size),
|
||||
VecMisc2::Frintm => ("frintm", size),
|
||||
VecMisc2::Frintp => ("frintp", size),
|
||||
};
|
||||
|
||||
let rd_size = if is_shll { size.widen() } else { size };
|
||||
|
||||
@@ -2373,6 +2373,43 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::FminPseudo | Opcode::FmaxPseudo => {
|
||||
let ty = ctx.input_ty(insn, 0);
|
||||
if ty == F32X4 || ty == F64X2 {
|
||||
// pmin(a,b) => bitsel(b, a, cmpgt(a, b))
|
||||
// pmax(a,b) => bitsel(b, a, cmpgt(b, a))
|
||||
let r_dst = get_output_reg(ctx, outputs[0]);
|
||||
let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||
// Since we're going to write the output register `r_dst` anyway, we might as
|
||||
// well first use it to hold the comparison result. This has the slightly unusual
|
||||
// effect that we modify the output register in the first instruction (`fcmgt`)
|
||||
// but read both the inputs again in the second instruction (`bsl`), which means
|
||||
// that the output register can't be either of the input registers. Regalloc
|
||||
// should handle this correctly, nevertheless.
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::Fcmgt,
|
||||
rd: r_dst,
|
||||
rn: if op == Opcode::FminPseudo { r_a } else { r_b },
|
||||
rm: if op == Opcode::FminPseudo { r_b } else { r_a },
|
||||
size: if ty == F32X4 {
|
||||
VectorSize::Size32x4
|
||||
} else {
|
||||
VectorSize::Size64x2
|
||||
},
|
||||
});
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::Bsl,
|
||||
rd: r_dst,
|
||||
rn: r_b,
|
||||
rm: r_a,
|
||||
size: VectorSize::Size8x16,
|
||||
});
|
||||
} else {
|
||||
panic!("Opcode::FminPseudo | Opcode::FmaxPseudo: unhandled type");
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
|
||||
let ty = ty.unwrap();
|
||||
let bits = ty_bits(ty);
|
||||
@@ -2411,21 +2448,39 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
}
|
||||
|
||||
Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
|
||||
let bits = ty_bits(ctx.output_ty(insn, 0));
|
||||
let op = match (op, bits) {
|
||||
(Opcode::Ceil, 32) => FpuRoundMode::Plus32,
|
||||
(Opcode::Ceil, 64) => FpuRoundMode::Plus64,
|
||||
(Opcode::Floor, 32) => FpuRoundMode::Minus32,
|
||||
(Opcode::Floor, 64) => FpuRoundMode::Minus64,
|
||||
(Opcode::Trunc, 32) => FpuRoundMode::Zero32,
|
||||
(Opcode::Trunc, 64) => FpuRoundMode::Zero64,
|
||||
(Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
|
||||
(Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
|
||||
_ => panic!("Unknown op/bits combination"),
|
||||
};
|
||||
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
ctx.emit(Inst::FpuRound { op, rd, rn });
|
||||
let ty = ctx.output_ty(insn, 0);
|
||||
if !ty.is_vector() {
|
||||
let bits = ty_bits(ty);
|
||||
let op = match (op, bits) {
|
||||
(Opcode::Ceil, 32) => FpuRoundMode::Plus32,
|
||||
(Opcode::Ceil, 64) => FpuRoundMode::Plus64,
|
||||
(Opcode::Floor, 32) => FpuRoundMode::Minus32,
|
||||
(Opcode::Floor, 64) => FpuRoundMode::Minus64,
|
||||
(Opcode::Trunc, 32) => FpuRoundMode::Zero32,
|
||||
(Opcode::Trunc, 64) => FpuRoundMode::Zero64,
|
||||
(Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
|
||||
(Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
|
||||
_ => panic!("Unknown op/bits combination (scalar)"),
|
||||
};
|
||||
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
ctx.emit(Inst::FpuRound { op, rd, rn });
|
||||
} else {
|
||||
let (op, size) = match (op, ty) {
|
||||
(Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
|
||||
(Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
|
||||
(Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
|
||||
(Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
|
||||
(Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
|
||||
(Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
|
||||
(Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
|
||||
(Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
|
||||
_ => panic!("Unknown op/ty combination (vector){:?}", ty),
|
||||
};
|
||||
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
ctx.emit(Inst::VecMisc { op, rd, rn, size });
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::Fma => {
|
||||
|
||||
Reference in New Issue
Block a user