diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 634d4eb6ea..2351ac8899 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -333,6 +333,8 @@ pub(crate) enum InstructionSet { /// Some SSE operations requiring 2 operands r/m and r. #[derive(Clone, Copy, PartialEq)] pub enum SseOpcode { + Addps, + Addpd, Addss, Addsd, Andps, @@ -351,11 +353,17 @@ pub enum SseOpcode { Cvtss2sd, Cvttss2si, Cvttsd2si, + Divps, + Divpd, Divss, Divsd, Insertps, + Maxps, + Maxpd, Maxss, Maxsd, + Minps, + Minpd, Minss, Minsd, Movaps, @@ -376,8 +384,12 @@ pub enum SseOpcode { Roundss, Roundsd, Rsqrtss, + Sqrtps, + Sqrtpd, Sqrtss, Sqrtsd, + Subps, + Subpd, Subss, Subsd, Ucomiss, @@ -391,14 +403,18 @@ impl SseOpcode { pub(crate) fn available_from(&self) -> InstructionSet { use InstructionSet::*; match self { - SseOpcode::Addss + SseOpcode::Addps + | SseOpcode::Addss | SseOpcode::Andps | SseOpcode::Andnps | SseOpcode::Cvtsi2ss | SseOpcode::Cvtss2si | SseOpcode::Cvttss2si + | SseOpcode::Divps | SseOpcode::Divss + | SseOpcode::Maxps | SseOpcode::Maxss + | SseOpcode::Minps | SseOpcode::Minss | SseOpcode::Movaps | SseOpcode::Movss @@ -408,14 +424,17 @@ impl SseOpcode { | SseOpcode::Orps | SseOpcode::Rcpss | SseOpcode::Rsqrtss + | SseOpcode::Sqrtps + | SseOpcode::Sqrtss + | SseOpcode::Subps | SseOpcode::Subss | SseOpcode::Ucomiss - | SseOpcode::Sqrtss | SseOpcode::Comiss | SseOpcode::Cmpss | SseOpcode::Xorps => SSE, - SseOpcode::Addsd + SseOpcode::Addpd + | SseOpcode::Addsd | SseOpcode::Andpd | SseOpcode::Andnpd | SseOpcode::Cvtsd2ss @@ -423,8 +442,11 @@ impl SseOpcode { | SseOpcode::Cvtsi2sd | SseOpcode::Cvtss2sd | SseOpcode::Cvttsd2si + | SseOpcode::Divpd | SseOpcode::Divsd + | SseOpcode::Maxpd | SseOpcode::Maxsd + | SseOpcode::Minpd | SseOpcode::Minsd | SseOpcode::Movapd | SseOpcode::Movd @@ -434,7 +456,9 @@ impl SseOpcode { | SseOpcode::Mulpd | SseOpcode::Mulsd | SseOpcode::Orpd + | SseOpcode::Sqrtpd | SseOpcode::Sqrtsd + | SseOpcode::Subpd | SseOpcode::Subsd | SseOpcode::Ucomisd | SseOpcode::Comisd @@ -457,6 +481,8 @@ impl SseOpcode { impl fmt::Debug for SseOpcode { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let name = match self { + SseOpcode::Addps => "addps", + SseOpcode::Addpd => "addpd", SseOpcode::Addss => "addss", SseOpcode::Addsd => "addsd", SseOpcode::Andpd => "andpd", @@ -473,10 +499,16 @@ impl fmt::Debug for SseOpcode { SseOpcode::Cvtss2sd => "cvtss2sd", SseOpcode::Cvttss2si => "cvttss2si", SseOpcode::Cvttsd2si => "cvttsd2si", + SseOpcode::Divps => "divps", + SseOpcode::Divpd => "divpd", SseOpcode::Divss => "divss", SseOpcode::Divsd => "divsd", + SseOpcode::Maxps => "maxps", + SseOpcode::Maxpd => "maxpd", SseOpcode::Maxss => "maxss", SseOpcode::Maxsd => "maxsd", + SseOpcode::Minps => "minps", + SseOpcode::Minpd => "minpd", SseOpcode::Minss => "minss", SseOpcode::Minsd => "minsd", SseOpcode::Movaps => "movaps", @@ -497,8 +529,12 @@ impl fmt::Debug for SseOpcode { SseOpcode::Roundss => "roundss", SseOpcode::Roundsd => "roundsd", SseOpcode::Rsqrtss => "rsqrtss", + SseOpcode::Sqrtps => "sqrtps", + SseOpcode::Sqrtpd => "sqrtpd", SseOpcode::Sqrtss => "sqrtss", SseOpcode::Sqrtsd => "sqrtsd", + SseOpcode::Subps => "subps", + SseOpcode::Subpd => "subpd", SseOpcode::Subss => "subss", SseOpcode::Subsd => "subsd", SseOpcode::Ucomiss => "ucomiss", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 49cc20acae..42c8e9c77e 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1585,24 +1585,36 @@ pub(crate) fn emit( } => { let rex = RexFlags::clear_w(); let (prefix, opcode) = match op { + SseOpcode::Addps => (LegacyPrefix::None, 0x0F58), + SseOpcode::Addpd => (LegacyPrefix::_66, 0x0F58), SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58), SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58), SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54), SseOpcode::Andps => (LegacyPrefix::None, 0x0F54), SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55), SseOpcode::Andnpd => (LegacyPrefix::_66, 0x0F55), + SseOpcode::Divps => (LegacyPrefix::None, 0x0F5E), + SseOpcode::Divpd => (LegacyPrefix::_66, 0x0F5E), + SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E), + SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E), + SseOpcode::Minps => (LegacyPrefix::None, 0x0F5D), + SseOpcode::Minpd => (LegacyPrefix::_66, 0x0F5D), + SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D), + SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D), + SseOpcode::Maxps => (LegacyPrefix::None, 0x0F5F), + SseOpcode::Maxpd => (LegacyPrefix::_66, 0x0F5F), + SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F), + SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F), + SseOpcode::Mulps => (LegacyPrefix::None, 0x0F59), + SseOpcode::Mulpd => (LegacyPrefix::_66, 0x0F59), SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59), SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59), SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56), SseOpcode::Orps => (LegacyPrefix::None, 0x0F56), + SseOpcode::Subps => (LegacyPrefix::None, 0x0F5C), + SseOpcode::Subpd => (LegacyPrefix::_66, 0x0F5C), SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C), SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C), - SseOpcode::Minss => (LegacyPrefix::_F3, 0x0F5D), - SseOpcode::Minsd => (LegacyPrefix::_F2, 0x0F5D), - SseOpcode::Divss => (LegacyPrefix::_F3, 0x0F5E), - SseOpcode::Divsd => (LegacyPrefix::_F2, 0x0F5E), - SseOpcode::Maxss => (LegacyPrefix::_F3, 0x0F5F), - SseOpcode::Maxsd => (LegacyPrefix::_F2, 0x0F5F), SseOpcode::Xorps => (LegacyPrefix::None, 0x0F57), SseOpcode::Xorpd => (LegacyPrefix::_66, 0x0F57), _ => unimplemented!("Opcode {:?} not implemented", op), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index dadab97d94..1be2c53d86 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -51,14 +51,6 @@ fn is_bool_ty(ty: Type) -> bool { } } -fn is_float_ty(ty: Type) -> bool { - match ty { - types::F32 | types::F64 => true, - types::R32 => panic!("shouldn't have 32-bits refs on x64"), - _ => false, - } -} - fn int_ty_is_64(ty: Type) -> bool { match ty { types::I8 | types::I16 | types::I32 => false, @@ -67,14 +59,6 @@ fn int_ty_is_64(ty: Type) -> bool { } } -fn flt_ty_is_64(ty: Type) -> bool { - match ty { - types::F32 => false, - types::F64 => true, - _ => panic!("type {} is none of F32, F64", ty), - } -} - fn iri_to_u64_imm(ctx: Ctx, inst: IRInst) -> Option { ctx.get_constant(inst) } @@ -1081,32 +1065,54 @@ fn lower_insn_to_regs>( } Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => { - let lhs = input_to_reg_mem(ctx, inputs[0]); - let rhs = input_to_reg(ctx, inputs[1]); + let lhs = input_to_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); let dst = output_to_reg(ctx, outputs[0]); + let ty = ty.unwrap(); + + // Move the `lhs` to the same register as `dst`; this may not emit an actual move + // but ensures that the registers are the same to match x86's read-write operand + // encoding. + ctx.emit(Inst::gen_move(dst, lhs, ty)); // Note: min and max can't be handled here, because of the way Cranelift defines them: // if any operand is a NaN, they must return the NaN operand, while the x86 machine - // instruction will return the other operand. - let (f32_op, f64_op) = match op { - Opcode::Fadd => (SseOpcode::Addss, SseOpcode::Addsd), - Opcode::Fsub => (SseOpcode::Subss, SseOpcode::Subsd), - Opcode::Fmul => (SseOpcode::Mulss, SseOpcode::Mulsd), - Opcode::Fdiv => (SseOpcode::Divss, SseOpcode::Divsd), - _ => unreachable!(), + // instruction will return the second operand if either operand is a NaN. + let sse_op = match ty { + types::F32 => match op { + Opcode::Fadd => SseOpcode::Addss, + Opcode::Fsub => SseOpcode::Subss, + Opcode::Fmul => SseOpcode::Mulss, + Opcode::Fdiv => SseOpcode::Divss, + _ => unreachable!(), + }, + types::F64 => match op { + Opcode::Fadd => SseOpcode::Addsd, + Opcode::Fsub => SseOpcode::Subsd, + Opcode::Fmul => SseOpcode::Mulsd, + Opcode::Fdiv => SseOpcode::Divsd, + _ => unreachable!(), + }, + types::F32X4 => match op { + Opcode::Fadd => SseOpcode::Addps, + Opcode::Fsub => SseOpcode::Subps, + Opcode::Fmul => SseOpcode::Mulps, + Opcode::Fdiv => SseOpcode::Divps, + _ => unreachable!(), + }, + types::F64X2 => match op { + Opcode::Fadd => SseOpcode::Addpd, + Opcode::Fsub => SseOpcode::Subpd, + Opcode::Fmul => SseOpcode::Mulpd, + Opcode::Fdiv => SseOpcode::Divpd, + _ => unreachable!(), + }, + _ => panic!( + "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}", + ty + ), }; - - let is_64 = flt_ty_is_64(ty.unwrap()); - - let mov_op = if is_64 { - SseOpcode::Movsd - } else { - SseOpcode::Movss - }; - ctx.emit(Inst::xmm_mov(mov_op, lhs, dst, None)); - - let sse_op = if is_64 { f64_op } else { f32_op }; - ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::reg(rhs), dst)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); } Opcode::Fmin | Opcode::Fmax => { @@ -1127,17 +1133,19 @@ fn lower_insn_to_regs>( Opcode::Sqrt => { let src = input_to_reg_mem(ctx, inputs[0]); let dst = output_to_reg(ctx, outputs[0]); + let ty = ty.unwrap(); - let (f32_op, f64_op) = match op { - Opcode::Sqrt => (SseOpcode::Sqrtss, SseOpcode::Sqrtsd), - _ => unreachable!(), + let sse_op = match ty { + types::F32 => SseOpcode::Sqrtss, + types::F64 => SseOpcode::Sqrtsd, + types::F32X4 => SseOpcode::Sqrtps, + types::F64X2 => SseOpcode::Sqrtpd, + _ => panic!( + "invalid type: expected one of [F32, F64, F32X4, F64X2], found {}", + ty + ), }; - let sse_op = if flt_ty_is_64(ty.unwrap()) { - f64_op - } else { - f32_op - }; ctx.emit(Inst::xmm_unary_rm_r(sse_op, src, dst)); }