Fold fcvt_low_from_uint into previously existing clif instructions
This commit is contained in:
@@ -4457,28 +4457,6 @@ pub(crate) fn define(
|
|||||||
.operands_out(vec![a]),
|
.operands_out(vec![a]),
|
||||||
);
|
);
|
||||||
|
|
||||||
ig.push(
|
|
||||||
Inst::new(
|
|
||||||
"fcvt_low_from_uint",
|
|
||||||
r#"
|
|
||||||
|
|
||||||
Converts packed unsigned 32-bit integers to packed double precision floating point.
|
|
||||||
|
|
||||||
Considering only the low half of the register, each lane in `x` is interpreted as a
|
|
||||||
unsigned 32-bit integer that is then converted to a double precision float. This
|
|
||||||
instruction differs from fcvt_from_uint in that it converts half the number of lanes
|
|
||||||
which are converted to occupy twice the number of bits. No rounding should be needed
|
|
||||||
for the resulting float.
|
|
||||||
|
|
||||||
The result type will have half the number of vector lanes as the input.
|
|
||||||
|
|
||||||
"#,
|
|
||||||
&formats.unary,
|
|
||||||
)
|
|
||||||
.operands_in(vec![x])
|
|
||||||
.operands_out(vec![a]),
|
|
||||||
);
|
|
||||||
|
|
||||||
let WideInt = &TypeVar::new(
|
let WideInt = &TypeVar::new(
|
||||||
"WideInt",
|
"WideInt",
|
||||||
"An integer type with lanes from `i16` upwards",
|
"An integer type with lanes from `i16` upwards",
|
||||||
|
|||||||
@@ -3557,7 +3557,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
|
|
||||||
Opcode::ConstAddr
|
Opcode::ConstAddr
|
||||||
| Opcode::FcvtLowFromSint
|
| Opcode::FcvtLowFromSint
|
||||||
| Opcode::FcvtLowFromUint
|
|
||||||
| Opcode::Fvdemote
|
| Opcode::Fvdemote
|
||||||
| Opcode::FvpromoteLow
|
| Opcode::FvpromoteLow
|
||||||
| Opcode::Vconcat
|
| Opcode::Vconcat
|
||||||
|
|||||||
@@ -2867,7 +2867,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
| Opcode::UwidenHigh
|
| Opcode::UwidenHigh
|
||||||
| Opcode::WideningPairwiseDotProductS
|
| Opcode::WideningPairwiseDotProductS
|
||||||
| Opcode::SqmulRoundSat
|
| Opcode::SqmulRoundSat
|
||||||
| Opcode::FcvtLowFromUint
|
|
||||||
| Opcode::FvpromoteLow
|
| Opcode::FvpromoteLow
|
||||||
| Opcode::Fvdemote => {
|
| Opcode::Fvdemote => {
|
||||||
// TODO
|
// TODO
|
||||||
|
|||||||
@@ -4154,58 +4154,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
dst,
|
dst,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
Opcode::FcvtLowFromUint => {
|
|
||||||
// Algorithm uses unpcklps to help create a float that is equivalent
|
|
||||||
// 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
|
|
||||||
// every value of the mantissa represents a corresponding uint32 number.
|
|
||||||
// When we subtract 0x1.0p52 we are left with double(src).
|
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
|
||||||
let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
|
||||||
|
|
||||||
ctx.emit(Inst::gen_move(dst, src, types::I32X4));
|
|
||||||
|
|
||||||
static UINT_MASK: [u8; 16] = [
|
|
||||||
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
|
||||||
0x00, 0x00,
|
|
||||||
];
|
|
||||||
|
|
||||||
let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
|
|
||||||
|
|
||||||
ctx.emit(Inst::xmm_load_const(
|
|
||||||
uint_mask_const,
|
|
||||||
uint_mask,
|
|
||||||
types::I32X4,
|
|
||||||
));
|
|
||||||
|
|
||||||
// Creates 0x1.0p52 + double(src)
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Unpcklps,
|
|
||||||
RegMem::from(uint_mask),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
|
|
||||||
static UINT_MASK_HIGH: [u8; 16] = [
|
|
||||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
|
||||||
0x30, 0x43,
|
|
||||||
];
|
|
||||||
|
|
||||||
let uint_mask_high_const =
|
|
||||||
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
|
|
||||||
let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
|
||||||
ctx.emit(Inst::xmm_load_const(
|
|
||||||
uint_mask_high_const,
|
|
||||||
uint_mask_high,
|
|
||||||
types::I32X4,
|
|
||||||
));
|
|
||||||
|
|
||||||
// 0x1.0p52 + double(src) - 0x1.0p52
|
|
||||||
ctx.emit(Inst::xmm_rm_r(
|
|
||||||
SseOpcode::Subpd,
|
|
||||||
RegMem::from(uint_mask_high),
|
|
||||||
dst,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
Opcode::FcvtFromUint => {
|
Opcode::FcvtFromUint => {
|
||||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
let ty = ty.unwrap();
|
let ty = ty.unwrap();
|
||||||
@@ -4253,6 +4201,67 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
|
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
|
||||||
};
|
};
|
||||||
|
} else if let Some(uwiden) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
|
||||||
|
let uwiden_input = InsnInput {
|
||||||
|
insn: uwiden,
|
||||||
|
input: 0,
|
||||||
|
};
|
||||||
|
let src = put_input_in_reg(ctx, uwiden_input);
|
||||||
|
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||||
|
let input_ty = ctx.input_ty(uwiden, 0);
|
||||||
|
let output_ty = ctx.output_ty(insn, 0);
|
||||||
|
|
||||||
|
// Matches_input further obfuscates which Wasm instruction this is ultimately
|
||||||
|
// lowering. Check here that the types are as expected for F64x2ConvertLowI32x4U.
|
||||||
|
debug_assert!(input_ty == types::I32X4 || output_ty == types::F64X2);
|
||||||
|
|
||||||
|
// Algorithm uses unpcklps to help create a float that is equivalent
|
||||||
|
// 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
|
||||||
|
// every value of the mantissa represents a corresponding uint32 number.
|
||||||
|
// When we subtract 0x1.0p52 we are left with double(src).
|
||||||
|
let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
||||||
|
ctx.emit(Inst::gen_move(dst, src, types::I32X4));
|
||||||
|
|
||||||
|
static UINT_MASK: [u8; 16] = [
|
||||||
|
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
|
0x00, 0x00, 0x00,
|
||||||
|
];
|
||||||
|
|
||||||
|
let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
|
||||||
|
|
||||||
|
ctx.emit(Inst::xmm_load_const(
|
||||||
|
uint_mask_const,
|
||||||
|
uint_mask,
|
||||||
|
types::I32X4,
|
||||||
|
));
|
||||||
|
|
||||||
|
// Creates 0x1.0p52 + double(src)
|
||||||
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Unpcklps,
|
||||||
|
RegMem::from(uint_mask),
|
||||||
|
dst,
|
||||||
|
));
|
||||||
|
|
||||||
|
static UINT_MASK_HIGH: [u8; 16] = [
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
|
0x00, 0x30, 0x43,
|
||||||
|
];
|
||||||
|
|
||||||
|
let uint_mask_high_const =
|
||||||
|
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
|
||||||
|
let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
|
||||||
|
ctx.emit(Inst::xmm_load_const(
|
||||||
|
uint_mask_high_const,
|
||||||
|
uint_mask_high,
|
||||||
|
types::I32X4,
|
||||||
|
));
|
||||||
|
|
||||||
|
// 0x1.0p52 + double(src) - 0x1.0p52
|
||||||
|
ctx.emit(Inst::xmm_rm_r(
|
||||||
|
SseOpcode::Subpd,
|
||||||
|
RegMem::from(uint_mask_high),
|
||||||
|
dst,
|
||||||
|
));
|
||||||
} else {
|
} else {
|
||||||
assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
|
assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
|
||||||
let src = put_input_in_reg(ctx, inputs[0]);
|
let src = put_input_in_reg(ctx, inputs[0]);
|
||||||
@@ -4595,7 +4604,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
(types::I16X8, types::I32X4) => {
|
(types::I16X8, types::I32X4) => {
|
||||||
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
|
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
|
||||||
}
|
}
|
||||||
_ => unreachable!(),
|
_ => unreachable!(
|
||||||
|
"In UwidenLow: input_ty {:?}, output_ty {:?}",
|
||||||
|
input_ty, output_ty
|
||||||
|
),
|
||||||
},
|
},
|
||||||
Opcode::UwidenHigh => match (input_ty, output_ty) {
|
Opcode::UwidenHigh => match (input_ty, output_ty) {
|
||||||
(types::I8X16, types::I16X8) => {
|
(types::I8X16, types::I16X8) => {
|
||||||
|
|||||||
@@ -565,7 +565,6 @@ where
|
|||||||
Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
|
Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
|
||||||
Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
|
Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
|
||||||
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
|
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
|
||||||
Opcode::FcvtLowFromUint => unimplemented!("FcvtLowFromUint"),
|
|
||||||
Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
|
Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
|
||||||
Opcode::Fvdemote => unimplemented!("Fvdemote"),
|
Opcode::Fvdemote => unimplemented!("Fvdemote"),
|
||||||
Opcode::Isplit => unimplemented!("Isplit"),
|
Opcode::Isplit => unimplemented!("Isplit"),
|
||||||
|
|||||||
@@ -1780,7 +1780,8 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
|||||||
}
|
}
|
||||||
Operator::F64x2ConvertLowI32x4U => {
|
Operator::F64x2ConvertLowI32x4U => {
|
||||||
let a = pop1_with_bitcast(state, I32X4, builder);
|
let a = pop1_with_bitcast(state, I32X4, builder);
|
||||||
state.push1(builder.ins().fcvt_low_from_uint(F64X2, a));
|
let widened_a = builder.ins().uwiden_low(a);
|
||||||
|
state.push1(builder.ins().fcvt_from_uint(F64X2, widened_a));
|
||||||
}
|
}
|
||||||
Operator::F64x2PromoteLowF32x4 => {
|
Operator::F64x2PromoteLowF32x4 => {
|
||||||
let a = pop1_with_bitcast(state, F32X4, builder);
|
let a = pop1_with_bitcast(state, F32X4, builder);
|
||||||
|
|||||||
Reference in New Issue
Block a user