machinst x64: add basic packed FP arithmetic
Includes instruction definition of packed min/max.
This commit is contained in:
@@ -333,6 +333,8 @@ pub(crate) enum InstructionSet {
|
|||||||
/// Some SSE operations requiring 2 operands r/m and r.
|
/// Some SSE operations requiring 2 operands r/m and r.
|
||||||
#[derive(Clone, Copy, PartialEq)]
|
#[derive(Clone, Copy, PartialEq)]
|
||||||
pub enum SseOpcode {
|
pub enum SseOpcode {
|
||||||
|
Addps,
|
||||||
|
Addpd,
|
||||||
Addss,
|
Addss,
|
||||||
Addsd,
|
Addsd,
|
||||||
Andps,
|
Andps,
|
||||||
@@ -351,11 +353,17 @@ pub enum SseOpcode {
|
|||||||
Cvtss2sd,
|
Cvtss2sd,
|
||||||
Cvttss2si,
|
Cvttss2si,
|
||||||
Cvttsd2si,
|
Cvttsd2si,
|
||||||
|
Divps,
|
||||||
|
Divpd,
|
||||||
Divss,
|
Divss,
|
||||||
Divsd,
|
Divsd,
|
||||||
Insertps,
|
Insertps,
|
||||||
|
Maxps,
|
||||||
|
Maxpd,
|
||||||
Maxss,
|
Maxss,
|
||||||
Maxsd,
|
Maxsd,
|
||||||
|
Minps,
|
||||||
|
Minpd,
|
||||||
Minss,
|
Minss,
|
||||||
Minsd,
|
Minsd,
|
||||||
Movaps,
|
Movaps,
|
||||||
@@ -376,8 +384,12 @@ pub enum SseOpcode {
|
|||||||
Roundss,
|
Roundss,
|
||||||
Roundsd,
|
Roundsd,
|
||||||
Rsqrtss,
|
Rsqrtss,
|
||||||
|
Sqrtps,
|
||||||
|
Sqrtpd,
|
||||||
Sqrtss,
|
Sqrtss,
|
||||||
Sqrtsd,
|
Sqrtsd,
|
||||||
|
Subps,
|
||||||
|
Subpd,
|
||||||
Subss,
|
Subss,
|
||||||
Subsd,
|
Subsd,
|
||||||
Ucomiss,
|
Ucomiss,
|
||||||
@@ -391,14 +403,18 @@ impl SseOpcode {
|
|||||||
pub(crate) fn available_from(&self) -> InstructionSet {
|
pub(crate) fn available_from(&self) -> InstructionSet {
|
||||||
use InstructionSet::*;
|
use InstructionSet::*;
|
||||||
match self {
|
match self {
|
||||||
SseOpcode::Addss
|
SseOpcode::Addps
|
||||||
|
| SseOpcode::Addss
|
||||||
| SseOpcode::Andps
|
| SseOpcode::Andps
|
||||||
| SseOpcode::Andnps
|
| SseOpcode::Andnps
|
||||||
| SseOpcode::Cvtsi2ss
|
| SseOpcode::Cvtsi2ss
|
||||||
| SseOpcode::Cvtss2si
|
| SseOpcode::Cvtss2si
|
||||||
| SseOpcode::Cvttss2si
|
| SseOpcode::Cvttss2si
|
||||||
|
| SseOpcode::Divps
|
||||||
| SseOpcode::Divss
|
| SseOpcode::Divss
|
||||||
|
| SseOpcode::Maxps
|
||||||
| SseOpcode::Maxss
|
| SseOpcode::Maxss
|
||||||
|
| SseOpcode::Minps
|
||||||
| SseOpcode::Minss
|
| SseOpcode::Minss
|
||||||
| SseOpcode::Movaps
|
| SseOpcode::Movaps
|
||||||
| SseOpcode::Movss
|
| SseOpcode::Movss
|
||||||
@@ -408,14 +424,17 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Orps
|
| SseOpcode::Orps
|
||||||
| SseOpcode::Rcpss
|
| SseOpcode::Rcpss
|
||||||
| SseOpcode::Rsqrtss
|
| SseOpcode::Rsqrtss
|
||||||
|
| SseOpcode::Sqrtps
|
||||||
|
| SseOpcode::Sqrtss
|
||||||
|
| SseOpcode::Subps
|
||||||
| SseOpcode::Subss
|
| SseOpcode::Subss
|
||||||
| SseOpcode::Ucomiss
|
| SseOpcode::Ucomiss
|
||||||
| SseOpcode::Sqrtss
|
|
||||||
| SseOpcode::Comiss
|
| SseOpcode::Comiss
|
||||||
| SseOpcode::Cmpss
|
| SseOpcode::Cmpss
|
||||||
| SseOpcode::Xorps => SSE,
|
| SseOpcode::Xorps => SSE,
|
||||||
|
|
||||||
SseOpcode::Addsd
|
SseOpcode::Addpd
|
||||||
|
| SseOpcode::Addsd
|
||||||
| SseOpcode::Andpd
|
| SseOpcode::Andpd
|
||||||
| SseOpcode::Andnpd
|
| SseOpcode::Andnpd
|
||||||
| SseOpcode::Cvtsd2ss
|
| SseOpcode::Cvtsd2ss
|
||||||
@@ -423,8 +442,11 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Cvtsi2sd
|
| SseOpcode::Cvtsi2sd
|
||||||
| SseOpcode::Cvtss2sd
|
| SseOpcode::Cvtss2sd
|
||||||
| SseOpcode::Cvttsd2si
|
| SseOpcode::Cvttsd2si
|
||||||
|
| SseOpcode::Divpd
|
||||||
| SseOpcode::Divsd
|
| SseOpcode::Divsd
|
||||||
|
| SseOpcode::Maxpd
|
||||||
| SseOpcode::Maxsd
|
| SseOpcode::Maxsd
|
||||||
|
| SseOpcode::Minpd
|
||||||
| SseOpcode::Minsd
|
| SseOpcode::Minsd
|
||||||
| SseOpcode::Movapd
|
| SseOpcode::Movapd
|
||||||
| SseOpcode::Movd
|
| SseOpcode::Movd
|
||||||
@@ -434,7 +456,9 @@ impl SseOpcode {
|
|||||||
| SseOpcode::Mulpd
|
| SseOpcode::Mulpd
|
||||||
| SseOpcode::Mulsd
|
| SseOpcode::Mulsd
|
||||||
| SseOpcode::Orpd
|
| SseOpcode::Orpd
|
||||||
|
| SseOpcode::Sqrtpd
|
||||||
| SseOpcode::Sqrtsd
|
| SseOpcode::Sqrtsd
|
||||||
|
| SseOpcode::Subpd
|
||||||
| SseOpcode::Subsd
|
| SseOpcode::Subsd
|
||||||
| SseOpcode::Ucomisd
|
| SseOpcode::Ucomisd
|
||||||
| SseOpcode::Comisd
|
| SseOpcode::Comisd
|
||||||
@@ -457,6 +481,8 @@ impl SseOpcode {
|
|||||||
impl fmt::Debug for SseOpcode {
|
impl fmt::Debug for SseOpcode {
|
||||||
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
||||||
let name = match self {
|
let name = match self {
|
||||||
|
SseOpcode::Addps => "addps",
|
||||||
|
SseOpcode::Addpd => "addpd",
|
||||||
SseOpcode::Addss => "addss",
|
SseOpcode::Addss => "addss",
|
||||||
SseOpcode::Addsd => "addsd",
|
SseOpcode::Addsd => "addsd",
|
||||||
SseOpcode::Andpd => "andpd",
|
SseOpcode::Andpd => "andpd",
|
||||||
@@ -473,10 +499,16 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Cvtss2sd => "cvtss2sd",
|
SseOpcode::Cvtss2sd => "cvtss2sd",
|
||||||
SseOpcode::Cvttss2si => "cvttss2si",
|
SseOpcode::Cvttss2si => "cvttss2si",
|
||||||
SseOpcode::Cvttsd2si => "cvttsd2si",
|
SseOpcode::Cvttsd2si => "cvttsd2si",
|
||||||
|
SseOpcode::Divps => "divps",
|
||||||
|
SseOpcode::Divpd => "divpd",
|
||||||
SseOpcode::Divss => "divss",
|
SseOpcode::Divss => "divss",
|
||||||
SseOpcode::Divsd => "divsd",
|
SseOpcode::Divsd => "divsd",
|
||||||
|
SseOpcode::Maxps => "maxps",
|
||||||
|
SseOpcode::Maxpd => "maxpd",
|
||||||
SseOpcode::Maxss => "maxss",
|
SseOpcode::Maxss => "maxss",
|
||||||
SseOpcode::Maxsd => "maxsd",
|
SseOpcode::Maxsd => "maxsd",
|
||||||
|
SseOpcode::Minps => "minps",
|
||||||
|
SseOpcode::Minpd => "minpd",
|
||||||
SseOpcode::Minss => "minss",
|
SseOpcode::Minss => "minss",
|
||||||
SseOpcode::Minsd => "minsd",
|
SseOpcode::Minsd => "minsd",
|
||||||
SseOpcode::Movaps => "movaps",
|
SseOpcode::Movaps => "movaps",
|
||||||
@@ -497,8 +529,12 @@ impl fmt::Debug for SseOpcode {
|
|||||||
SseOpcode::Roundss => "roundss",
|
SseOpcode::Roundss => "roundss",
|
||||||
SseOpcode::Roundsd => "roundsd",
|
SseOpcode::Roundsd => "roundsd",
|
||||||
SseOpcode::Rsqrtss => "rsqrtss",
|
SseOpcode::Rsqrtss => "rsqrtss",
|
||||||
|
SseOpcode::Sqrtps => "sqrtps",
|
||||||
|
SseOpcode::Sqrtpd => "sqrtpd",
|
||||||
SseOpcode::Sqrtss => "sqrtss",
|
SseOpcode::Sqrtss => "sqrtss",
|
||||||
SseOpcode::Sqrtsd => "sqrtsd",
|
SseOpcode::Sqrtsd => "sqrtsd",
|
||||||
|
SseOpcode::Subps => "subps",
|
||||||
|
SseOpcode::Subpd => "subpd",
|
||||||
SseOpcode::Subss => "subss",
|
SseOpcode::Subss => "subss",
|
||||||
SseOpcode::Subsd => "subsd",
|
SseOpcode::Subsd => "subsd",
|
||||||
SseOpcode::Ucomiss => "ucomiss",
|
SseOpcode::Ucomiss => "ucomiss",
|
||||||
|
|||||||
@@ -1585,24 +1585,36 @@ pub(crate) fn emit(
|
|||||||
} => {
|
} => {
|
||||||
let rex = RexFlags::clear_w();
|
let rex = RexFlags::clear_w();
|
||||||
let (prefix, opcode) = match op {
|
let (prefix, opcode) = match op {
|
||||||
|
SseOpcode::Addps => (LegacyPrefix::None, 0x0F58),
|
||||||
|
SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58),
|
||||||
SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58),
|
SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58),
|
||||||
SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58),
|
SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58),
|
||||||
SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54),
|
SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54),
|
||||||
SseOpcode::Andps => (LegacyPrefix::None, 0x0F54),
|
SseOpcode::Andps => (LegacyPrefix::None, 0x0F54),
|
||||||
SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55),
|
SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55),
|
||||||
SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55),
|
SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55),
|
||||||
|
SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E),
|
||||||
|
SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E),
|
||||||
|
SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E),
|
||||||
|
SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E),
|
||||||
|
SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D),
|
||||||
|
SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D),
|
||||||
|
SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D),
|
||||||
|
SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D),
|
||||||
|
SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F),
|
||||||
|
SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F),
|
||||||
|
SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F),
|
||||||
|
SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F),
|
||||||
|
SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59),
|
||||||
|
SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59),
|
||||||
SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59),
|
SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59),
|
||||||
SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59),
|
SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59),
|
||||||
SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56),
|
SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56),
|
||||||
SseOpcode::Orps => (LegacyPrefix::None, 0x0F56),
|
SseOpcode::Orps => (LegacyPrefix::None, 0x0F56),
|
||||||
|
SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C),
|
||||||
|
SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C),
|
||||||
SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C),
|
SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C),
|
||||||
SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C),
|
SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C),
|
||||||
SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D),
|
|
||||||
SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D),
|
|
||||||
SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E),
|
|
||||||
SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E),
|
|
||||||
SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F),
|
|
||||||
SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F),
|
|
||||||
SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57),
|
SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57),
|
||||||
SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57),
|
SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57),
|
||||||
_ => unimplemented!("Opcode {:?} not implemented", op),
|
_ => unimplemented!("Opcode {:?} not implemented", op),
|
||||||
|
|||||||
@@ -51,14 +51,6 @@ fn is_bool_ty(ty: Type) -> bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_float_ty(ty: Type) -> bool {
|
|
||||||
match ty {
|
|
||||||
types::F32 | types::F64 => true,
|
|
||||||
types::R32 => panic!("shouldn't have 32-bits refs on x64"),
|
|
||||||
_ => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn int_ty_is_64(ty: Type) -> bool {
|
fn int_ty_is_64(ty: Type) -> bool {
|
||||||
match ty {
|
match ty {
|
||||||
types::I8 | types::I16 | types::I32 => false,
|
types::I8 | types::I16 | types::I32 => false,
|
||||||
@@ -67,14 +59,6 @@ fn int_ty_is_64(ty: Type) -> bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn flt_ty_is_64(ty: Type) -> bool {
|
|
||||||
match ty {
|
|
||||||
types::F32 => false,
|
|
||||||
types::F64 => true,
|
|
||||||
_ => panic!("type {} is none of F32, F64", ty),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn iri_to_u64_imm(ctx: Ctx, inst: IRInst) -> Option<u64> {
|
fn iri_to_u64_imm(ctx: Ctx, inst: IRInst) -> Option<u64> {
|
||||||
ctx.get_constant(inst)
|
ctx.get_constant(inst)
|
||||||
}
|
}
|
||||||
@@ -1081,32 +1065,54 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
|
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
|
||||||
let lhs = input_to_reg_mem(ctx, inputs[0]);
|
let lhs = input_to_reg(ctx, inputs[0]);
|
||||||
let rhs = input_to_reg(ctx, inputs[1]);
|
let rhs = input_to_reg_mem(ctx, inputs[1]);
|
||||||
let dst = output_to_reg(ctx, outputs[0]);
|
let dst = output_to_reg(ctx, outputs[0]);
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
|
||||||
|
// Move the `lhs` to the same register as `dst`; this may not emit an actual move
|
||||||
|
// but ensures that the registers are the same to match x86's read-write operand
|
||||||
|
// encoding.
|
||||||
|
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
||||||
|
|
||||||
// Note: min and max can't be handled here, because of the way Cranelift defines them:
|
// Note: min and max can't be handled here, because of the way Cranelift defines them:
|
||||||
// if any operand is a NaN, they must return the NaN operand, while the x86 machine
|
// if any operand is a NaN, they must return the NaN operand, while the x86 machine
|
||||||
// instruction will return the other operand.
|
// instruction will return the second operand if either operand is a NaN.
|
||||||
let (f32_op, f64_op) = match op {
|
let sse_op = match ty {
|
||||||
Opcode::Fadd => (SseOpcode::Addss, SseOpcode::Addsd),
|
types::F32 => match op {
|
||||||
Opcode::Fsub => (SseOpcode::Subss, SseOpcode::Subsd),
|
Opcode::Fadd => SseOpcode::Addss,
|
||||||
Opcode::Fmul => (SseOpcode::Mulss, SseOpcode::Mulsd),
|
Opcode::Fsub => SseOpcode::Subss,
|
||||||
Opcode::Fdiv => (SseOpcode::Divss, SseOpcode::Divsd),
|
Opcode::Fmul => SseOpcode::Mulss,
|
||||||
|
Opcode::Fdiv => SseOpcode::Divss,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
types::F64 => match op {
|
||||||
|
Opcode::Fadd => SseOpcode::Addsd,
|
||||||
|
Opcode::Fsub => SseOpcode::Subsd,
|
||||||
|
Opcode::Fmul => SseOpcode::Mulsd,
|
||||||
|
Opcode::Fdiv => SseOpcode::Divsd,
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
types::F32X4 => match op {
|
||||||
|
Opcode::Fadd => SseOpcode::Addps,
|
||||||
|
Opcode::Fsub => SseOpcode::Subps,
|
||||||
|
Opcode::Fmul => SseOpcode::Mulps,
|
||||||
|
Opcode::Fdiv => SseOpcode::Divps,
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
types::F64X2 => match op {
|
||||||
|
Opcode::Fadd => SseOpcode::Addpd,
|
||||||
|
Opcode::Fsub => SseOpcode::Subpd,
|
||||||
|
Opcode::Fmul => SseOpcode::Mulpd,
|
||||||
|
Opcode::Fdiv => SseOpcode::Divpd,
|
||||||
|
_ => unreachable!(),
|
||||||
|
},
|
||||||
|
_ => panic!(
|
||||||
|
"invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
|
||||||
|
ty
|
||||||
|
),
|
||||||
};
|
};
|
||||||
|
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
|
||||||
let is_64 = flt_ty_is_64(ty.unwrap());
|
|
||||||
|
|
||||||
let mov_op = if is_64 {
|
|
||||||
SseOpcode::Movsd
|
|
||||||
} else {
|
|
||||||
SseOpcode::Movss
|
|
||||||
};
|
|
||||||
ctx.emit(Inst::xmm_mov(mov_op, lhs, dst, None));
|
|
||||||
|
|
||||||
let sse_op = if is_64 { f64_op } else { f32_op };
|
|
||||||
ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::reg(rhs), dst));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Opcode::Fmin | Opcode::Fmax => {
|
Opcode::Fmin | Opcode::Fmax => {
|
||||||
@@ -1127,17 +1133,19 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
|||||||
Opcode::Sqrt => {
|
Opcode::Sqrt => {
|
||||||
let src = input_to_reg_mem(ctx, inputs[0]);
|
let src = input_to_reg_mem(ctx, inputs[0]);
|
||||||
let dst = output_to_reg(ctx, outputs[0]);
|
let dst = output_to_reg(ctx, outputs[0]);
|
||||||
|
let ty = ty.unwrap();
|
||||||
|
|
||||||
let (f32_op, f64_op) = match op {
|
let sse_op = match ty {
|
||||||
Opcode::Sqrt => (SseOpcode::Sqrtss, SseOpcode::Sqrtsd),
|
types::F32 => SseOpcode::Sqrtss,
|
||||||
_ => unreachable!(),
|
types::F64 => SseOpcode::Sqrtsd,
|
||||||
|
types::F32X4 => SseOpcode::Sqrtps,
|
||||||
|
types::F64X2 => SseOpcode::Sqrtpd,
|
||||||
|
_ => panic!(
|
||||||
|
"invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
|
||||||
|
ty
|
||||||
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
let sse_op = if flt_ty_is_64(ty.unwrap()) {
|
|
||||||
f64_op
|
|
||||||
} else {
|
|
||||||
f32_op
|
|
||||||
};
|
|
||||||
ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
|
ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user