ISLE: port more ops on x64 to lowering patterns. (#3855)
This commit is contained in:
@@ -1861,6 +1861,86 @@
|
||||
(rule (pandn src1 src2)
|
||||
(xmm_rm_r $F64X2 (SseOpcode.Pandn) src1 src2))
|
||||
|
||||
;; Helper for creating `addss` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F32` type tag and the
;; `SseOpcode.Addss` opcode, and yields whatever `Xmm` that helper produces.
(decl addss (Xmm XmmMem) Xmm)
(rule (addss lhs rhs)
      (xmm_rm_r $F32 (SseOpcode.Addss) lhs rhs))
|
||||
|
||||
;; Helper for creating `addsd` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F64` type tag and the
;; `SseOpcode.Addsd` opcode, and yields whatever `Xmm` that helper produces.
(decl addsd (Xmm XmmMem) Xmm)
(rule (addsd lhs rhs)
      (xmm_rm_r $F64 (SseOpcode.Addsd) lhs rhs))
|
||||
|
||||
;; Helper for creating `addps` instructions.
;;
;; `addps` is the packed single-precision form, so the operation is tagged with
;; the vector type `$F32X4` rather than the scalar `$F32` used by `addss`.
;; Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Addps`.
(decl addps (Xmm XmmMem) Xmm)
(rule (addps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Addps) src1 src2))
|
||||
|
||||
;; Helper for creating `addpd` instructions.
;;
;; `addpd` is the packed double-precision form, so the operation is tagged with
;; the vector type `$F64X2` (the scalar sibling `addsd` uses `$F64`); a
;; single-precision tag would describe neither the lane width nor the lane
;; count. Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Addpd`.
(decl addpd (Xmm XmmMem) Xmm)
(rule (addpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Addpd) src1 src2))
|
||||
|
||||
;; Helper for creating `subss` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F32` type tag and the
;; `SseOpcode.Subss` opcode, and yields whatever `Xmm` that helper produces.
(decl subss (Xmm XmmMem) Xmm)
(rule (subss lhs rhs)
      (xmm_rm_r $F32 (SseOpcode.Subss) lhs rhs))
|
||||
|
||||
;; Helper for creating `subsd` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F64` type tag and the
;; `SseOpcode.Subsd` opcode, and yields whatever `Xmm` that helper produces.
(decl subsd (Xmm XmmMem) Xmm)
(rule (subsd lhs rhs)
      (xmm_rm_r $F64 (SseOpcode.Subsd) lhs rhs))
|
||||
|
||||
;; Helper for creating `subps` instructions.
;;
;; `subps` is the packed single-precision form, so the operation is tagged with
;; the vector type `$F32X4` rather than the scalar `$F32` used by `subss`.
;; Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Subps`.
(decl subps (Xmm XmmMem) Xmm)
(rule (subps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Subps) src1 src2))
|
||||
|
||||
;; Helper for creating `subpd` instructions.
;;
;; `subpd` is the packed double-precision form, so the operation is tagged with
;; the vector type `$F64X2` (the scalar sibling `subsd` uses `$F64`); a
;; single-precision tag would describe neither the lane width nor the lane
;; count. Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Subpd`.
(decl subpd (Xmm XmmMem) Xmm)
(rule (subpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Subpd) src1 src2))
|
||||
|
||||
;; Helper for creating `mulss` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F32` type tag and the
;; `SseOpcode.Mulss` opcode, and yields whatever `Xmm` that helper produces.
(decl mulss (Xmm XmmMem) Xmm)
(rule (mulss lhs rhs)
      (xmm_rm_r $F32 (SseOpcode.Mulss) lhs rhs))
|
||||
|
||||
;; Helper for creating `mulsd` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F64` type tag and the
;; `SseOpcode.Mulsd` opcode, and yields whatever `Xmm` that helper produces.
(decl mulsd (Xmm XmmMem) Xmm)
(rule (mulsd lhs rhs)
      (xmm_rm_r $F64 (SseOpcode.Mulsd) lhs rhs))
|
||||
|
||||
;; Helper for creating `mulps` instructions.
;;
;; `mulps` is the packed single-precision form, so the operation is tagged with
;; the vector type `$F32X4` rather than the scalar `$F32` used by `mulss`.
;; Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Mulps`.
(decl mulps (Xmm XmmMem) Xmm)
(rule (mulps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Mulps) src1 src2))
|
||||
|
||||
;; Helper for creating `mulpd` instructions.
;;
;; `mulpd` is the packed double-precision form, so the operation is tagged with
;; the vector type `$F64X2` (the scalar sibling `mulsd` uses `$F64`); a
;; single-precision tag would describe neither the lane width nor the lane
;; count. Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Mulpd`.
(decl mulpd (Xmm XmmMem) Xmm)
(rule (mulpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Mulpd) src1 src2))
|
||||
|
||||
;; Helper for creating `divss` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F32` type tag and the
;; `SseOpcode.Divss` opcode, and yields whatever `Xmm` that helper produces.
(decl divss (Xmm XmmMem) Xmm)
(rule (divss lhs rhs)
      (xmm_rm_r $F32 (SseOpcode.Divss) lhs rhs))
|
||||
|
||||
;; Helper for creating `divsd` instructions.
;;
;; Forwards both operands to `xmm_rm_r` with the scalar `$F64` type tag and the
;; `SseOpcode.Divsd` opcode, and yields whatever `Xmm` that helper produces.
(decl divsd (Xmm XmmMem) Xmm)
(rule (divsd lhs rhs)
      (xmm_rm_r $F64 (SseOpcode.Divsd) lhs rhs))
|
||||
|
||||
;; Helper for creating `divps` instructions.
;;
;; `divps` is the packed single-precision form, so the operation is tagged with
;; the vector type `$F32X4` rather than the scalar `$F32` used by `divss`.
;; Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Divps`.
(decl divps (Xmm XmmMem) Xmm)
(rule (divps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Divps) src1 src2))
|
||||
|
||||
;; Helper for creating `divpd` instructions.
;;
;; `divpd` is the packed double-precision form, so the operation is tagged with
;; the vector type `$F64X2` (the scalar sibling `divsd` uses `$F64`); a
;; single-precision tag would describe neither the lane width nor the lane
;; count. Both operands are forwarded unchanged to `xmm_rm_r` together with
;; `SseOpcode.Divpd`.
(decl divpd (Xmm XmmMem) Xmm)
(rule (divpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Divpd) src1 src2))
|
||||
|
||||
(decl sse_blend_op (Type) SseOpcode)
|
||||
(rule (sse_blend_op $F32X4) (SseOpcode.Blendvps))
|
||||
(rule (sse_blend_op $F64X2) (SseOpcode.Blendvpd))
|
||||
@@ -2041,6 +2121,16 @@
|
||||
lane
|
||||
size))
|
||||
|
||||
;; Helper for creating `pmaddwd` instructions.
;;
;; Obtains a fresh writable XMM temporary, emits an `MInst.XmmRmR` carrying
;; `SseOpcode.Pmaddwd`, the two sources and that temporary, and then yields the
;; temporary as the rule's result.
(decl pmaddwd (Xmm XmmMem) Xmm)
(rule (pmaddwd lhs rhs)
      (let ((result WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Pmaddwd) lhs rhs result))))
        result))
|
||||
|
||||
;; Helper for creating `insertps` instructions.
|
||||
(decl insertps (Xmm XmmMem u8) Xmm)
|
||||
(rule (insertps src1 src2 lane)
|
||||
@@ -2271,6 +2361,11 @@
|
||||
(rule (ud2 code)
|
||||
(SideEffectNoResult.Inst (MInst.Ud2 code)))
|
||||
|
||||
;; Helper for creating `hlt` instructions.
;;
;; Takes no operands; wraps `MInst.Hlt` in `SideEffectNoResult.Inst` so callers
;; can hand it to a side-effect consumer.
(decl hlt () SideEffectNoResult)
(rule (hlt) (SideEffectNoResult.Inst (MInst.Hlt)))
|
||||
|
||||
;; Helper for creating `lzcnt` instructions.
|
||||
(decl lzcnt (Type Gpr) Gpr)
|
||||
(rule (lzcnt ty src)
|
||||
|
||||
@@ -1986,3 +1986,76 @@
|
||||
|
||||
(rule (lower (has_type (fits_in_64 ty) (breduce src)))
|
||||
(value_regs_get_gpr src 0))
|
||||
|
||||
;; Rules for `bint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Booleans are stored as all-zeroes (0) or all-ones (-1). We AND out
|
||||
;; the LSB to give a 0 / 1-valued integer result.
|
||||
|
||||
;; Result types matched by `fits_in_64`: a single AND with the immediate 1,
;; performed at the result type, keeps only the least-significant bit.
(rule (lower (has_type (fits_in_64 ty) (bint val)))
      (x64_and ty val (RegMemImm.Imm 1)))

;; `$I128` result: the first register is the input ANDed with 1 at `$I64`, and
;; the second register is the `$I64` constant zero.
(rule (lower (has_type $I128 (bint val)))
      (value_regs (x64_and $I64 val (RegMemImm.Imm 1))
                  (imm $I64 0)))
|
||||
|
||||
;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; `debugtrap` lowers to the side effect built by the `hlt` helper.
(rule (lower (debugtrap)) (side_effect (hlt)))
|
||||
|
||||
;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Only the `$I32X4` result type is matched; both operands go straight to the
;; `pmaddwd` helper.
(rule (lower (has_type $I32X4 (widening_pairwise_dot_product_s lhs rhs)))
      (pmaddwd lhs rhs))
|
||||
|
||||
;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; N.B.: there are no load-op merging rules here. We can't guarantee
|
||||
;; the RHS (if a load) is 128-bit aligned, so we must avoid merging a
|
||||
;; load. Likewise for other ops below.
|
||||
|
||||
;; One rule per supported result type; each forwards both operands to the SSE
;; add helper with the matching suffix (ss / sd / ps / pd).
(rule (lower (has_type $F32 (fadd lhs rhs)))   (addss lhs rhs))
(rule (lower (has_type $F64 (fadd lhs rhs)))   (addsd lhs rhs))
(rule (lower (has_type $F32X4 (fadd lhs rhs))) (addps lhs rhs))
(rule (lower (has_type $F64X2 (fadd lhs rhs))) (addpd lhs rhs))
|
||||
|
||||
;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; One rule per supported result type; each forwards both operands to the SSE
;; subtract helper with the matching suffix (ss / sd / ps / pd).
(rule (lower (has_type $F32 (fsub lhs rhs)))   (subss lhs rhs))
(rule (lower (has_type $F64 (fsub lhs rhs)))   (subsd lhs rhs))
(rule (lower (has_type $F32X4 (fsub lhs rhs))) (subps lhs rhs))
(rule (lower (has_type $F64X2 (fsub lhs rhs))) (subpd lhs rhs))
|
||||
|
||||
;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; One rule per supported result type; each forwards both operands to the SSE
;; multiply helper with the matching suffix (ss / sd / ps / pd).
(rule (lower (has_type $F32 (fmul lhs rhs)))   (mulss lhs rhs))
(rule (lower (has_type $F64 (fmul lhs rhs)))   (mulsd lhs rhs))
(rule (lower (has_type $F32X4 (fmul lhs rhs))) (mulps lhs rhs))
(rule (lower (has_type $F64X2 (fmul lhs rhs))) (mulpd lhs rhs))
|
||||
|
||||
;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; One rule per supported result type; each forwards both operands to the SSE
;; divide helper with the matching suffix (ss / sd / ps / pd).
(rule (lower (has_type $F32 (fdiv lhs rhs)))   (divss lhs rhs))
(rule (lower (has_type $F64 (fdiv lhs rhs)))   (divsd lhs rhs))
(rule (lower (has_type $F32X4 (fdiv lhs rhs))) (divps lhs rhs))
(rule (lower (has_type $F64X2 (fdiv lhs rhs))) (divpd lhs rhs))
|
||||
|
||||
@@ -903,33 +903,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
| Opcode::Sextend
|
||||
| Opcode::Breduce
|
||||
| Opcode::Bextend
|
||||
| Opcode::Ireduce => implemented_in_isle(ctx),
|
||||
|
||||
Opcode::Bint => {
|
||||
// Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
|
||||
// out the LSB to give a 0 / 1-valued integer result.
|
||||
let rn = put_input_in_reg(ctx, inputs[0]);
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
let ty = ctx.output_ty(insn, 0);
|
||||
|
||||
ctx.emit(Inst::gen_move(rd.regs()[0], rn, types::I64));
|
||||
ctx.emit(Inst::alu_rmi_r(
|
||||
OperandSize::Size64,
|
||||
AluRmiROpcode::And,
|
||||
RegMemImm::imm(1),
|
||||
rd.regs()[0],
|
||||
));
|
||||
|
||||
if ty == types::I128 {
|
||||
let upper = rd.regs()[1];
|
||||
ctx.emit(Inst::alu_rmi_r(
|
||||
OperandSize::Size64,
|
||||
AluRmiROpcode::Xor,
|
||||
RegMemImm::reg(upper.to_reg()),
|
||||
upper,
|
||||
));
|
||||
}
|
||||
}
|
||||
| Opcode::Ireduce
|
||||
| Opcode::Bint
|
||||
| Opcode::Debugtrap
|
||||
| Opcode::WideningPairwiseDotProductS
|
||||
| Opcode::Fadd
|
||||
| Opcode::Fsub
|
||||
| Opcode::Fmul
|
||||
| Opcode::Fdiv => implemented_in_isle(ctx),
|
||||
|
||||
Opcode::Icmp => {
|
||||
let condcode = ctx.data(insn).cond_code().unwrap();
|
||||
@@ -1240,10 +1221,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
abi.emit_stack_post_adjust(ctx);
|
||||
}
|
||||
|
||||
Opcode::Debugtrap => {
|
||||
ctx.emit(Inst::Hlt);
|
||||
}
|
||||
|
||||
Opcode::Trapif | Opcode::Trapff => {
|
||||
let trap_code = ctx.data(insn).trap_code().unwrap();
|
||||
|
||||
@@ -1301,77 +1278,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
};
|
||||
}
|
||||
|
||||
Opcode::WideningPairwiseDotProductS => {
|
||||
let lhs = put_input_in_reg(ctx, inputs[0]);
|
||||
let rhs = input_to_reg_mem(ctx, inputs[1]);
|
||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
let ty = ty.unwrap();
|
||||
|
||||
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
||||
|
||||
if ty == types::I32X4 {
|
||||
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst));
|
||||
} else {
|
||||
panic!(
|
||||
"Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
|
||||
ty
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
|
||||
let lhs = put_input_in_reg(ctx, inputs[0]);
|
||||
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
|
||||
// must avoid merging a load here.
|
||||
let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
|
||||
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
let ty = ty.unwrap();
|
||||
|
||||
// Move the `lhs` to the same register as `dst`; this may not emit an actual move
|
||||
// but ensures that the registers are the same to match x86's read-write operand
|
||||
// encoding.
|
||||
ctx.emit(Inst::gen_move(dst, lhs, ty));
|
||||
|
||||
// Note: min and max can't be handled here, because of the way Cranelift defines them:
|
||||
// if any operand is a NaN, they must return the NaN operand, while the x86 machine
|
||||
// instruction will return the second operand if either operand is a NaN.
|
||||
let sse_op = match ty {
|
||||
types::F32 => match op {
|
||||
Opcode::Fadd => SseOpcode::Addss,
|
||||
Opcode::Fsub => SseOpcode::Subss,
|
||||
Opcode::Fmul => SseOpcode::Mulss,
|
||||
Opcode::Fdiv => SseOpcode::Divss,
|
||||
_ => unreachable!(),
|
||||
},
|
||||
types::F64 => match op {
|
||||
Opcode::Fadd => SseOpcode::Addsd,
|
||||
Opcode::Fsub => SseOpcode::Subsd,
|
||||
Opcode::Fmul => SseOpcode::Mulsd,
|
||||
Opcode::Fdiv => SseOpcode::Divsd,
|
||||
_ => unreachable!(),
|
||||
},
|
||||
types::F32X4 => match op {
|
||||
Opcode::Fadd => SseOpcode::Addps,
|
||||
Opcode::Fsub => SseOpcode::Subps,
|
||||
Opcode::Fmul => SseOpcode::Mulps,
|
||||
Opcode::Fdiv => SseOpcode::Divps,
|
||||
_ => unreachable!(),
|
||||
},
|
||||
types::F64X2 => match op {
|
||||
Opcode::Fadd => SseOpcode::Addpd,
|
||||
Opcode::Fsub => SseOpcode::Subpd,
|
||||
Opcode::Fmul => SseOpcode::Mulpd,
|
||||
Opcode::Fdiv => SseOpcode::Divpd,
|
||||
_ => unreachable!(),
|
||||
},
|
||||
_ => panic!(
|
||||
"invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
|
||||
ty
|
||||
),
|
||||
};
|
||||
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
|
||||
}
|
||||
|
||||
Opcode::Fmin | Opcode::Fmax => {
|
||||
let lhs = put_input_in_reg(ctx, inputs[0]);
|
||||
let rhs = put_input_in_reg(ctx, inputs[1]);
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
src/clif.isle 9ea75a6f790b5c03
|
||||
src/prelude.isle b2bc986bcbbbb77
|
||||
src/isa/x64/inst.isle 67eb719e568c2a81
|
||||
src/isa/x64/lower.isle 2d06b233fb3a1e1c
|
||||
src/isa/x64/inst.isle 9a8a3babd8257100
|
||||
src/isa/x64/lower.isle f0f4af691241209e
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user