ISLE: port more ops on x64 to lowering patterns. (#3855)

This commit is contained in:
Chris Fallin
2022-02-28 13:28:42 -08:00
committed by GitHub
parent 90a081a731
commit d9dfc44c32
6 changed files with 777 additions and 270 deletions

View File

@@ -1861,6 +1861,86 @@
(rule (pandn src1 src2)
(xmm_rm_r $F64X2 (SseOpcode.Pandn) src1 src2))
;; Helper for creating `addss` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F32`.
(decl addss (Xmm XmmMem) Xmm)
(rule (addss lhs rhs)
      (xmm_rm_r $F32
                (SseOpcode.Addss)
                lhs
                rhs))
;; Helper for creating `addsd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F64`.
(decl addsd (Xmm XmmMem) Xmm)
(rule (addsd lhs rhs)
      (xmm_rm_r $F64
                (SseOpcode.Addsd)
                lhs
                rhs))
;; Helper for creating `addps` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F32X4` `fadd` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F32X4`
;; rather than the scalar `$F32` (the same convention `pandn` follows with
;; `$F64X2`).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl addps (Xmm XmmMem) Xmm)
(rule (addps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Addps) src1 src2))
;; Helper for creating `addpd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F64X2` `fadd` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F64X2`
;; rather than the scalar `$F32` (the same convention `pandn` follows).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl addpd (Xmm XmmMem) Xmm)
(rule (addpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Addpd) src1 src2))
;; Helper for creating `subss` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F32`.
(decl subss (Xmm XmmMem) Xmm)
(rule (subss lhs rhs)
      (xmm_rm_r $F32
                (SseOpcode.Subss)
                lhs
                rhs))
;; Helper for creating `subsd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F64`.
(decl subsd (Xmm XmmMem) Xmm)
(rule (subsd lhs rhs)
      (xmm_rm_r $F64
                (SseOpcode.Subsd)
                lhs
                rhs))
;; Helper for creating `subps` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F32X4` `fsub` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F32X4`
;; rather than the scalar `$F32` (the same convention `pandn` follows with
;; `$F64X2`).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl subps (Xmm XmmMem) Xmm)
(rule (subps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Subps) src1 src2))
;; Helper for creating `subpd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F64X2` `fsub` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F64X2`
;; rather than the scalar `$F32` (the same convention `pandn` follows).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl subpd (Xmm XmmMem) Xmm)
(rule (subpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Subpd) src1 src2))
;; Helper for creating `mulss` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F32`.
(decl mulss (Xmm XmmMem) Xmm)
(rule (mulss lhs rhs)
      (xmm_rm_r $F32
                (SseOpcode.Mulss)
                lhs
                rhs))
;; Helper for creating `mulsd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F64`.
(decl mulsd (Xmm XmmMem) Xmm)
(rule (mulsd lhs rhs)
      (xmm_rm_r $F64
                (SseOpcode.Mulsd)
                lhs
                rhs))
;; Helper for creating `mulps` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F32X4` `fmul` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F32X4`
;; rather than the scalar `$F32` (the same convention `pandn` follows with
;; `$F64X2`).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl mulps (Xmm XmmMem) Xmm)
(rule (mulps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Mulps) src1 src2))
;; Helper for creating `mulpd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F64X2` `fmul` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F64X2`
;; rather than the scalar `$F32` (the same convention `pandn` follows).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl mulpd (Xmm XmmMem) Xmm)
(rule (mulpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Mulpd) src1 src2))
;; Helper for creating `divss` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F32`.
(decl divss (Xmm XmmMem) Xmm)
(rule (divss lhs rhs)
      (xmm_rm_r $F32
                (SseOpcode.Divss)
                lhs
                rhs))
;; Helper for creating `divsd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by the shared `xmm_rm_r` constructor, invoked with type `$F64`.
(decl divsd (Xmm XmmMem) Xmm)
(rule (divsd lhs rhs)
      (xmm_rm_r $F64
                (SseOpcode.Divsd)
                lhs
                rhs))
;; Helper for creating `divps` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F32X4` `fdiv` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F32X4`
;; rather than the scalar `$F32` (the same convention `pandn` follows with
;; `$F64X2`).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl divps (Xmm XmmMem) Xmm)
(rule (divps src1 src2)
      (xmm_rm_r $F32X4 (SseOpcode.Divps) src1 src2))
;; Helper for creating `divpd` instructions.
;;
;; Takes an `Xmm` operand and an `XmmMem` operand and returns the `Xmm`
;; produced by `xmm_rm_r`. This helper is what the `$F64X2` `fdiv` lowering
;; selects, so the type handed to `xmm_rm_r` is the vector type `$F64X2`
;; rather than the scalar `$F32` (the same convention `pandn` follows).
;; NOTE(review): confirm how `xmm_rm_r` consumes its type argument.
(decl divpd (Xmm XmmMem) Xmm)
(rule (divpd src1 src2)
      (xmm_rm_r $F64X2 (SseOpcode.Divpd) src1 src2))
(decl sse_blend_op (Type) SseOpcode)
(rule (sse_blend_op $F32X4) (SseOpcode.Blendvps))
(rule (sse_blend_op $F64X2) (SseOpcode.Blendvpd))
@@ -2041,6 +2121,16 @@
lane
size))
;; Helper for creating `pmaddwd` instructions.
;;
;; Obtains a temporary writable XMM register, emits an `MInst.XmmRmR` with the
;; `Pmaddwd` opcode that reads both operands and writes that temporary, and
;; returns the temporary as the result.
(decl pmaddwd (Xmm XmmMem) Xmm)
(rule (pmaddwd lhs rhs)
      (let ((result WritableXmm (temp_writable_xmm))
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Pmaddwd) lhs rhs result))))
        result))
;; Helper for creating `insertps` instructions.
(decl insertps (Xmm XmmMem u8) Xmm)
(rule (insertps src1 src2 lane)
@@ -2271,6 +2361,11 @@
(rule (ud2 code)
(SideEffectNoResult.Inst (MInst.Ud2 code)))
;; Helper for creating `hlt` instructions.
;;
;; Takes no operands; wraps a bare `MInst.Hlt` in `SideEffectNoResult.Inst`
;; so the caller decides when it is emitted.
(decl hlt () SideEffectNoResult)
(rule (hlt) (SideEffectNoResult.Inst (MInst.Hlt)))
;; Helper for creating `lzcnt` instructions.
(decl lzcnt (Type Gpr) Gpr)
(rule (lzcnt ty src)

View File

@@ -1986,3 +1986,76 @@
;; A `breduce` whose result type fits in 64 bits lowers to register 0 of the
;; source value, taken as a GPR.
(rule (lower (has_type (fits_in_64 ty)
                       (breduce val)))
      (value_regs_get_gpr val 0))
;; Rules for `bint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Booleans are stored as all-zeroes (0) or all-ones (-1). Masking with the
;; immediate 1 keeps only the LSB, which gives a 0 / 1-valued integer result.
(rule (lower (has_type (fits_in_64 ty) (bint val)))
      (x64_and ty val (RegMemImm.Imm 1)))

;; For an `$I128` result, the first register holds the masked 64-bit value and
;; the second register is the constant zero.
(rule (lower (has_type $I128 (bint val)))
      (value_regs (x64_and $I64 val (RegMemImm.Imm 1))
                  (imm $I64 0)))
;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `debugtrap` produces no value: it lowers to a single `hlt`, emitted purely
;; for its side effect.
(rule (lower (debugtrap)) (side_effect (hlt)))
;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Only the `$I32X4` result type is matched; it maps directly onto `pmaddwd`.
(rule (lower (has_type $I32X4 (widening_pairwise_dot_product_s lhs rhs)))
      (pmaddwd lhs rhs))
;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; N.B.: no load-op merging rules are provided here. If the RHS is a load we
;; cannot guarantee that it is 128-bit aligned, so merging the load must be
;; avoided. The same holds for the other ops below.

;; One rule per supported result type, each selecting the matching helper.
(rule (lower (has_type $F32 (fadd lhs rhs)))
      (addss lhs rhs))
(rule (lower (has_type $F64 (fadd lhs rhs)))
      (addsd lhs rhs))
(rule (lower (has_type $F32X4 (fadd lhs rhs)))
      (addps lhs rhs))
(rule (lower (has_type $F64X2 (fadd lhs rhs)))
      (addpd lhs rhs))
;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; One rule per supported result type, each selecting the matching helper.
(rule (lower (has_type $F32 (fsub lhs rhs)))
      (subss lhs rhs))
(rule (lower (has_type $F64 (fsub lhs rhs)))
      (subsd lhs rhs))
(rule (lower (has_type $F32X4 (fsub lhs rhs)))
      (subps lhs rhs))
(rule (lower (has_type $F64X2 (fsub lhs rhs)))
      (subpd lhs rhs))
;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; One rule per supported result type, each selecting the matching helper.
(rule (lower (has_type $F32 (fmul lhs rhs)))
      (mulss lhs rhs))
(rule (lower (has_type $F64 (fmul lhs rhs)))
      (mulsd lhs rhs))
(rule (lower (has_type $F32X4 (fmul lhs rhs)))
      (mulps lhs rhs))
(rule (lower (has_type $F64X2 (fmul lhs rhs)))
      (mulpd lhs rhs))
;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; One rule per supported result type, each selecting the matching helper.
(rule (lower (has_type $F32 (fdiv lhs rhs)))
      (divss lhs rhs))
(rule (lower (has_type $F64 (fdiv lhs rhs)))
      (divsd lhs rhs))
(rule (lower (has_type $F32X4 (fdiv lhs rhs)))
      (divps lhs rhs))
(rule (lower (has_type $F64X2 (fdiv lhs rhs)))
      (divpd lhs rhs))

View File

@@ -903,33 +903,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Sextend
| Opcode::Breduce
| Opcode::Bextend
| Opcode::Ireduce => implemented_in_isle(ctx),
Opcode::Bint => {
// Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
// out the LSB to give a 0 / 1-valued integer result.
let rn = put_input_in_reg(ctx, inputs[0]);
let rd = get_output_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
ctx.emit(Inst::gen_move(rd.regs()[0], rn, types::I64));
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::And,
RegMemImm::imm(1),
rd.regs()[0],
));
if ty == types::I128 {
let upper = rd.regs()[1];
ctx.emit(Inst::alu_rmi_r(
OperandSize::Size64,
AluRmiROpcode::Xor,
RegMemImm::reg(upper.to_reg()),
upper,
));
}
}
| Opcode::Ireduce
| Opcode::Bint
| Opcode::Debugtrap
| Opcode::WideningPairwiseDotProductS
| Opcode::Fadd
| Opcode::Fsub
| Opcode::Fmul
| Opcode::Fdiv => implemented_in_isle(ctx),
Opcode::Icmp => {
let condcode = ctx.data(insn).cond_code().unwrap();
@@ -1240,10 +1221,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
abi.emit_stack_post_adjust(ctx);
}
Opcode::Debugtrap => {
ctx.emit(Inst::Hlt);
}
Opcode::Trapif | Opcode::Trapff => {
let trap_code = ctx.data(insn).trap_code().unwrap();
@@ -1301,77 +1278,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
};
}
Opcode::WideningPairwiseDotProductS => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
ctx.emit(Inst::gen_move(dst, lhs, ty));
if ty == types::I32X4 {
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst));
} else {
panic!(
"Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
ty
);
}
}
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
let lhs = put_input_in_reg(ctx, inputs[0]);
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
// must avoid merging a load here.
let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let ty = ty.unwrap();
// Move the `lhs` to the same register as `dst`; this may not emit an actual move
// but ensures that the registers are the same to match x86's read-write operand
// encoding.
ctx.emit(Inst::gen_move(dst, lhs, ty));
// Note: min and max can't be handled here, because of the way Cranelift defines them:
// if any operand is a NaN, they must return the NaN operand, while the x86 machine
// instruction will return the second operand if either operand is a NaN.
let sse_op = match ty {
types::F32 => match op {
Opcode::Fadd => SseOpcode::Addss,
Opcode::Fsub => SseOpcode::Subss,
Opcode::Fmul => SseOpcode::Mulss,
Opcode::Fdiv => SseOpcode::Divss,
_ => unreachable!(),
},
types::F64 => match op {
Opcode::Fadd => SseOpcode::Addsd,
Opcode::Fsub => SseOpcode::Subsd,
Opcode::Fmul => SseOpcode::Mulsd,
Opcode::Fdiv => SseOpcode::Divsd,
_ => unreachable!(),
},
types::F32X4 => match op {
Opcode::Fadd => SseOpcode::Addps,
Opcode::Fsub => SseOpcode::Subps,
Opcode::Fmul => SseOpcode::Mulps,
Opcode::Fdiv => SseOpcode::Divps,
_ => unreachable!(),
},
types::F64X2 => match op {
Opcode::Fadd => SseOpcode::Addpd,
Opcode::Fsub => SseOpcode::Subpd,
Opcode::Fmul => SseOpcode::Mulpd,
Opcode::Fdiv => SseOpcode::Divpd,
_ => unreachable!(),
},
_ => panic!(
"invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
ty
),
};
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
}
Opcode::Fmin | Opcode::Fmax => {
let lhs = put_input_in_reg(ctx, inputs[0]);
let rhs = put_input_in_reg(ctx, inputs[1]);

View File

@@ -1,4 +1,4 @@
src/clif.isle 9ea75a6f790b5c03
src/prelude.isle b2bc986bcbbbb77
src/isa/x64/inst.isle 67eb719e568c2a81
src/isa/x64/lower.isle 2d06b233fb3a1e1c
src/isa/x64/inst.isle 9a8a3babd8257100
src/isa/x64/lower.isle f0f4af691241209e

File diff suppressed because it is too large Load Diff