Fold fcvt_low_from_uint into previously existing clif instructions

This commit is contained in:
Johnnie Birch
2021-07-08 07:15:54 -07:00
parent 6dd2df4fb3
commit d8e813204e
6 changed files with 67 additions and 79 deletions

View File

@@ -4457,28 +4457,6 @@ pub(crate) fn define(
.operands_out(vec![a]), .operands_out(vec![a]),
); );
ig.push(
Inst::new(
"fcvt_low_from_uint",
r#"
Converts packed unsigned 32-bit integers to packed double precision floating point.
Considering only the low half of the register, each lane in `x` is interpreted as a
unsigned 32-bit integer that is then converted to a double precision float. This
instruction differs from fcvt_from_uint in that it converts half the number of lanes
which are converted to occupy twice the number of bits. No rounding should be needed
for the resulting float.
The result type will have half the number of vector lanes as the input.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);
let WideInt = &TypeVar::new( let WideInt = &TypeVar::new(
"WideInt", "WideInt",
"An integer type with lanes from `i16` upwards", "An integer type with lanes from `i16` upwards",

View File

@@ -3557,7 +3557,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::ConstAddr Opcode::ConstAddr
| Opcode::FcvtLowFromSint | Opcode::FcvtLowFromSint
| Opcode::FcvtLowFromUint
| Opcode::Fvdemote | Opcode::Fvdemote
| Opcode::FvpromoteLow | Opcode::FvpromoteLow
| Opcode::Vconcat | Opcode::Vconcat

View File

@@ -2867,7 +2867,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::UwidenHigh | Opcode::UwidenHigh
| Opcode::WideningPairwiseDotProductS | Opcode::WideningPairwiseDotProductS
| Opcode::SqmulRoundSat | Opcode::SqmulRoundSat
| Opcode::FcvtLowFromUint
| Opcode::FvpromoteLow | Opcode::FvpromoteLow
| Opcode::Fvdemote => { | Opcode::Fvdemote => {
// TODO // TODO

View File

@@ -4154,58 +4154,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
dst, dst,
)); ));
} }
Opcode::FcvtLowFromUint => {
// Algorithm uses unpcklps to help create a float that is equivalent
// 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
// every value of the mantissa represents a corresponding uint32 number.
// When we subtract 0x1.0p52 we are left with double(src).
let src = put_input_in_reg(ctx, inputs[0]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, types::I32X4));
static UINT_MASK: [u8; 16] = [
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00,
];
let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
ctx.emit(Inst::xmm_load_const(
uint_mask_const,
uint_mask,
types::I32X4,
));
// Creates 0x1.0p52 + double(src)
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Unpcklps,
RegMem::from(uint_mask),
dst,
));
static UINT_MASK_HIGH: [u8; 16] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x30, 0x43,
];
let uint_mask_high_const =
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
uint_mask_high_const,
uint_mask_high,
types::I32X4,
));
// 0x1.0p52 + double(src) - 0x1.0p52
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Subpd,
RegMem::from(uint_mask_high),
dst,
));
}
Opcode::FcvtFromUint => { Opcode::FcvtFromUint => {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap(); let ty = ty.unwrap();
@@ -4253,6 +4201,67 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} }
_ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty), _ => panic!("unexpected input type for FcvtFromUint: {:?}", input_ty),
}; };
} else if let Some(uwiden) = matches_input(ctx, inputs[0], Opcode::UwidenLow) {
let uwiden_input = InsnInput {
insn: uwiden,
input: 0,
};
let src = put_input_in_reg(ctx, uwiden_input);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let input_ty = ctx.input_ty(uwiden, 0);
let output_ty = ctx.output_ty(insn, 0);
// Matches_input further obfuscates which Wasm instruction this is ultimately
// lowering. Check here that the types are as expected for F64x2ConvertLowI32x4U.
debug_assert!(input_ty == types::I32X4 || output_ty == types::F64X2);
// The algorithm uses unpcklps to help create a float that is equivalent to
// 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
// every value of the mantissa represents a corresponding uint32 number.
// When we subtract 0x1.0p52, we are left with double(src).
let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::gen_move(dst, src, types::I32X4));
static UINT_MASK: [u8; 16] = [
0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00,
];
let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
ctx.emit(Inst::xmm_load_const(
uint_mask_const,
uint_mask,
types::I32X4,
));
// Creates 0x1.0p52 + double(src)
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Unpcklps,
RegMem::from(uint_mask),
dst,
));
static UINT_MASK_HIGH: [u8; 16] = [
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x30, 0x43,
];
let uint_mask_high_const =
ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
ctx.emit(Inst::xmm_load_const(
uint_mask_high_const,
uint_mask_high,
types::I32X4,
));
// 0x1.0p52 + double(src) - 0x1.0p52
ctx.emit(Inst::xmm_rm_r(
SseOpcode::Subpd,
RegMem::from(uint_mask_high),
dst,
));
} else { } else {
assert_eq!(ctx.input_ty(insn, 0), types::I32X4); assert_eq!(ctx.input_ty(insn, 0), types::I32X4);
let src = put_input_in_reg(ctx, inputs[0]); let src = put_input_in_reg(ctx, inputs[0]);
@@ -4595,7 +4604,10 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
(types::I16X8, types::I32X4) => { (types::I16X8, types::I32X4) => {
ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst)); ctx.emit(Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::reg(src), dst));
} }
_ => unreachable!(), _ => unreachable!(
"In UwidenLow: input_ty {:?}, output_ty {:?}",
input_ty, output_ty
),
}, },
Opcode::UwidenHigh => match (input_ty, output_ty) { Opcode::UwidenHigh => match (input_ty, output_ty) {
(types::I8X16, types::I16X8) => { (types::I8X16, types::I16X8) => {

View File

@@ -565,7 +565,6 @@ where
Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"), Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"), Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"), Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
Opcode::FcvtLowFromUint => unimplemented!("FcvtLowFromUint"),
Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"), Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
Opcode::Fvdemote => unimplemented!("Fvdemote"), Opcode::Fvdemote => unimplemented!("Fvdemote"),
Opcode::Isplit => unimplemented!("Isplit"), Opcode::Isplit => unimplemented!("Isplit"),

View File

@@ -1780,7 +1780,8 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
} }
Operator::F64x2ConvertLowI32x4U => { Operator::F64x2ConvertLowI32x4U => {
let a = pop1_with_bitcast(state, I32X4, builder); let a = pop1_with_bitcast(state, I32X4, builder);
state.push1(builder.ins().fcvt_low_from_uint(F64X2, a)); let widened_a = builder.ins().uwiden_low(a);
state.push1(builder.ins().fcvt_from_uint(F64X2, widened_a));
} }
Operator::F64x2PromoteLowF32x4 => { Operator::F64x2PromoteLowF32x4 => {
let a = pop1_with_bitcast(state, F32X4, builder); let a = pop1_with_bitcast(state, F32X4, builder);