x64: add support for packed promote and demote (#2783)
* Add support for x64 packed promote low
* Add support for x64 packed floating point demote
* Update vector promote low and demote by adding constraints

Also does some renaming and minor refactoring.
This commit is contained in:
@@ -4223,6 +4223,69 @@ pub(crate) fn define(
|
|||||||
.constraints(vec![WiderOrEq(Float.clone(), FloatTo.clone())]),
|
.constraints(vec![WiderOrEq(Float.clone(), FloatTo.clone())]),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let F64x2 = &TypeVar::new(
|
||||||
|
"F64x2",
|
||||||
|
"A SIMD vector type consisting of 2 lanes of 64-bit floats",
|
||||||
|
TypeSetBuilder::new()
|
||||||
|
.floats(64..64)
|
||||||
|
.simd_lanes(2..2)
|
||||||
|
.includes_scalars(false)
|
||||||
|
.build(),
|
||||||
|
);
|
||||||
|
let F32x4 = &TypeVar::new(
|
||||||
|
"F32x4",
|
||||||
|
"A SIMD vector type consisting of 4 lanes of 32-bit floats",
|
||||||
|
TypeSetBuilder::new()
|
||||||
|
.floats(32..32)
|
||||||
|
.simd_lanes(4..4)
|
||||||
|
.includes_scalars(false)
|
||||||
|
.build(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let x = &Operand::new("x", F64x2);
|
||||||
|
let a = &Operand::new("a", F32x4);
|
||||||
|
|
||||||
|
ig.push(
|
||||||
|
Inst::new(
|
||||||
|
"fvdemote",
|
||||||
|
r#"
|
||||||
|
Convert `x` to a smaller floating point format.
|
||||||
|
|
||||||
|
Each lane in `x` is converted to the destination floating point format
|
||||||
|
by rounding to nearest, ties to even.
|
||||||
|
|
||||||
|
Cranelift currently only supports two floating point formats
|
||||||
|
- `f32` and `f64`. This may change in the future.
|
||||||
|
|
||||||
|
Fvdemote differs from fdemote in that it targets vectors.
|
||||||
|
Fvdemote is constrained to having the input type being F64x2 and the result
|
||||||
|
type being F32x4. The result lane that was the upper half of the input lane
|
||||||
|
is initialized to zero.
|
||||||
|
"#,
|
||||||
|
&formats.unary,
|
||||||
|
)
|
||||||
|
.operands_in(vec![x])
|
||||||
|
.operands_out(vec![a]),
|
||||||
|
);
|
||||||
|
|
||||||
|
ig.push(
|
||||||
|
Inst::new(
|
||||||
|
"fvpromote_low",
|
||||||
|
r#"
|
||||||
|
Converts packed single precision floating point to packed double precision floating point.
|
||||||
|
|
||||||
|
Considering only the lower half of the register, the low lanes in `x` are interpreted as
|
||||||
|
single precision floats that are then converted to double precision floats.
|
||||||
|
|
||||||
|
The result type will have half the number of vector lanes as the input. Fvpromote_low is
|
||||||
|
constrained to input F32x4 with a result type of F64x2.
|
||||||
|
"#,
|
||||||
|
&formats.unary,
|
||||||
|
)
|
||||||
|
.operands_in(vec![a])
|
||||||
|
.operands_out(vec![x]),
|
||||||
|
);
|
||||||
|
|
||||||
let x = &Operand::new("x", Float);
|
let x = &Operand::new("x", Float);
|
||||||
let a = &Operand::new("a", IntTo);
|
let a = &Operand::new("a", IntTo);
|
||||||
|
|
||||||
|
|||||||
@@ -3193,6 +3193,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
|
|
||||||
Opcode::TlsValue => unimplemented!("tls_value"),
|
Opcode::TlsValue => unimplemented!("tls_value"),
|
||||||
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
|
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
|
||||||
|
Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
|
||||||
|
Opcode::Fvdemote => unimplemented!("Fvdemote"),
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -2548,7 +2548,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
| Opcode::SwidenHigh
|
| Opcode::SwidenHigh
|
||||||
| Opcode::UwidenLow
|
| Opcode::UwidenLow
|
||||||
| Opcode::UwidenHigh
|
| Opcode::UwidenHigh
|
||||||
| Opcode::WideningPairwiseDotProductS => {
|
| Opcode::WideningPairwiseDotProductS
|
||||||
|
| Opcode::FvpromoteLow
|
||||||
|
| Opcode::Fvdemote => {
|
||||||
// TODO
|
// TODO
|
||||||
panic!("Vector ops not implemented.");
|
panic!("Vector ops not implemented.");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -489,6 +489,8 @@ pub enum SseOpcode {
|
|||||||
Cmpsd,
|
Cmpsd,
|
||||||
Cvtdq2ps,
|
Cvtdq2ps,
|
||||||
Cvtdq2pd,
|
Cvtdq2pd,
|
||||||
|
Cvtpd2ps,
|
||||||
|
Cvtps2pd,
|
||||||
Cvtsd2ss,
|
Cvtsd2ss,
|
||||||
Cvtsd2si,
|
Cvtsd2si,
|
||||||
Cvtsi2ss,
|
Cvtsi2ss,
|
||||||
@@ -684,6 +686,8 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Comisd
|
| SseOpcode::Comisd
|
||||||
| SseOpcode::Cvtdq2ps
|
| SseOpcode::Cvtdq2ps
|
||||||
| SseOpcode::Cvtdq2pd
|
| SseOpcode::Cvtdq2pd
|
||||||
|
| SseOpcode::Cvtpd2ps
|
||||||
|
| SseOpcode::Cvtps2pd
|
||||||
| SseOpcode::Cvtsd2ss
|
| SseOpcode::Cvtsd2ss
|
||||||
| SseOpcode::Cvtsd2si
|
| SseOpcode::Cvtsd2si
|
||||||
| SseOpcode::Cvtsi2sd
|
| SseOpcode::Cvtsi2sd
|
||||||
@@ -843,6 +847,8 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Comisd => "comisd",
|
SseOpcode::Comisd => "comisd",
|
||||||
SseOpcode::Cvtdq2ps => "cvtdq2ps",
|
SseOpcode::Cvtdq2ps => "cvtdq2ps",
|
||||||
SseOpcode::Cvtdq2pd => "cvtdq2pd",
|
SseOpcode::Cvtdq2pd => "cvtdq2pd",
|
||||||
|
SseOpcode::Cvtpd2ps => "cvtpd2ps",
|
||||||
|
SseOpcode::Cvtps2pd => "cvtps2pd",
|
||||||
SseOpcode::Cvtsd2ss => "cvtsd2ss",
|
SseOpcode::Cvtsd2ss => "cvtsd2ss",
|
||||||
SseOpcode::Cvtsd2si => "cvtsd2si",
|
SseOpcode::Cvtsd2si => "cvtsd2si",
|
||||||
SseOpcode::Cvtsi2ss => "cvtsi2ss",
|
SseOpcode::Cvtsi2ss => "cvtsi2ss",
|
||||||
|
|||||||
@@ -1348,6 +1348,8 @@ pub(crate) fn emit(
|
|||||||
|
|
||||||
let (prefix, opcode, num_opcodes) = match op {
|
let (prefix, opcode, num_opcodes) = match op {
|
||||||
SseOpcode::Cvtdq2pd => (LegacyPrefixes::_F3, 0x0FE6, 2),
|
SseOpcode::Cvtdq2pd => (LegacyPrefixes::_F3, 0x0FE6, 2),
|
||||||
|
SseOpcode::Cvtpd2ps => (LegacyPrefixes::_66, 0x0F5A, 2),
|
||||||
|
SseOpcode::Cvtps2pd => (LegacyPrefixes::None, 0x0F5A, 2),
|
||||||
SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
|
SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
|
||||||
SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
|
SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
|
||||||
SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
|
SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
|
||||||
|
|||||||
@@ -3913,6 +3913,18 @@ fn test_x64_emit() {
|
|||||||
"vpopcntb %xmm2, %xmm8",
|
"vpopcntb %xmm2, %xmm8",
|
||||||
));
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_unary_rm_r(SseOpcode::Cvtpd2ps, RegMem::reg(xmm7), w_xmm7),
|
||||||
|
"660F5AFF",
|
||||||
|
"cvtpd2ps %xmm7, %xmm7",
|
||||||
|
));
|
||||||
|
|
||||||
|
insns.push((
|
||||||
|
Inst::xmm_unary_rm_r(SseOpcode::Cvtps2pd, RegMem::reg(xmm11), w_xmm9),
|
||||||
|
"450F5ACB",
|
||||||
|
"cvtps2pd %xmm11, %xmm9",
|
||||||
|
));
|
||||||
|
|
||||||
// Xmm to int conversions, and conversely.
|
// Xmm to int conversions, and conversely.
|
||||||
|
|
||||||
insns.push((
|
insns.push((
|
||||||
|
|||||||
@@ -4057,6 +4057,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
|
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtss2sd, src, dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Opcode::FvpromoteLow => {
|
||||||
|
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
ctx.emit(Inst::xmm_unary_rm_r(
|
||||||
|
SseOpcode::Cvtps2pd,
|
||||||
|
RegMem::from(src),
|
||||||
|
dst,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::Fdemote => {
|
Opcode::Fdemote => {
|
||||||
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
|
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
|
||||||
// must avoid merging a load here.
|
// must avoid merging a load here.
|
||||||
@@ -4065,6 +4075,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
|
ctx.emit(Inst::xmm_unary_rm_r(SseOpcode::Cvtsd2ss, src, dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Opcode::Fvdemote => {
|
||||||
|
let src = RegMem::reg(put_input_in_reg(ctx, inputs[0]));
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
ctx.emit(Inst::xmm_unary_rm_r(
|
||||||
|
SseOpcode::Cvtpd2ps,
|
||||||
|
RegMem::from(src),
|
||||||
|
dst,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
Opcode::FcvtFromSint => {
|
Opcode::FcvtFromSint => {
|
||||||
let output_ty = ty.unwrap();
|
let output_ty = ty.unwrap();
|
||||||
if !output_ty.is_vector() {
|
if !output_ty.is_vector() {
|
||||||
|
|||||||
@@ -564,6 +564,8 @@ where
|
|||||||
Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
|
Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
|
||||||
Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
|
Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
|
||||||
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
|
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
|
||||||
|
Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
|
||||||
|
Opcode::Fvdemote => unimplemented!("Fvdemote"),
|
||||||
Opcode::Isplit => unimplemented!("Isplit"),
|
Opcode::Isplit => unimplemented!("Isplit"),
|
||||||
Opcode::Iconcat => unimplemented!("Iconcat"),
|
Opcode::Iconcat => unimplemented!("Iconcat"),
|
||||||
Opcode::AtomicRmw => unimplemented!("AtomicRmw"),
|
Opcode::AtomicRmw => unimplemented!("AtomicRmw"),
|
||||||
|
|||||||
@@ -1779,6 +1779,14 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
|||||||
let a = pop1_with_bitcast(state, I32X4, builder);
|
let a = pop1_with_bitcast(state, I32X4, builder);
|
||||||
state.push1(builder.ins().fcvt_low_from_sint(F64X2, a));
|
state.push1(builder.ins().fcvt_low_from_sint(F64X2, a));
|
||||||
}
|
}
|
||||||
|
Operator::F64x2PromoteLowF32x4 => {
|
||||||
|
let a = pop1_with_bitcast(state, F32X4, builder);
|
||||||
|
state.push1(builder.ins().fvpromote_low(a));
|
||||||
|
}
|
||||||
|
Operator::F32x4DemoteF64x2Zero => {
|
||||||
|
let a = pop1_with_bitcast(state, F64X2, builder);
|
||||||
|
state.push1(builder.ins().fvdemote(a));
|
||||||
|
}
|
||||||
Operator::I32x4TruncSatF32x4S => {
|
Operator::I32x4TruncSatF32x4S => {
|
||||||
let a = pop1_with_bitcast(state, F32X4, builder);
|
let a = pop1_with_bitcast(state, F32X4, builder);
|
||||||
state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a))
|
state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a))
|
||||||
@@ -1884,8 +1892,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
|||||||
| Operator::I16x8ExtAddPairwiseI8x16U
|
| Operator::I16x8ExtAddPairwiseI8x16U
|
||||||
| Operator::I32x4ExtAddPairwiseI16x8S
|
| Operator::I32x4ExtAddPairwiseI16x8S
|
||||||
| Operator::I32x4ExtAddPairwiseI16x8U
|
| Operator::I32x4ExtAddPairwiseI16x8U
|
||||||
| Operator::F32x4DemoteF64x2Zero
|
|
||||||
| Operator::F64x2PromoteLowF32x4
|
|
||||||
| Operator::F64x2ConvertLowI32x4U
|
| Operator::F64x2ConvertLowI32x4U
|
||||||
| Operator::I32x4TruncSatF64x2SZero
|
| Operator::I32x4TruncSatF64x2SZero
|
||||||
| Operator::I32x4TruncSatF64x2UZero => {
|
| Operator::I32x4TruncSatF64x2UZero => {
|
||||||
|
|||||||
Reference in New Issue
Block a user