Convert sqrt..nearest to ISLE (AArch64) (#4508)

Converted the existing implementations for the following opcodes to ISLE
on AArch64:
- `sqrt`
- `fneg`
- `fabs`
- `fpromote`
- `fdemote`
- `ceil`
- `floor`
- `trunc`
- `nearest`

Copyright (c) 2022 Arm Limited
This commit is contained in:
Damian Heaton
2022-07-22 22:48:07 +01:00
committed by GitHub
parent 4720d09651
commit f1a0c40a53
5 changed files with 336 additions and 99 deletions

View File

@@ -1464,6 +1464,13 @@
(_ Unit (emit (MInst.VecRRR op dst src1 src2 size))))
dst))
;; Emit a single-source FPU instruction (`MInst.FpuRR`) and return the register
;; that receives its result.
;;
;; Arguments, in order: the `FPUOp1` operation to perform, the source register,
;; and the `ScalarSize` that is forwarded to the instruction unchanged.
(decl fpu_rr (FPUOp1 Reg ScalarSize) Reg)
(rule (fpu_rr op rn size)
      ;; The destination is a fresh temporary, always requested as `$F64`
      ;; whatever `size` is.
      ;; NOTE(review): presumably the type only selects the register class
      ;; here -- confirm against `temp_writable_reg`.
      (let ((rd WritableReg (temp_writable_reg $F64))
            (_ Unit (emit (MInst.FpuRR op size rd rn))))
        rd))
;; Helper for emitting `MInst.FpuRRR` instructions.
(decl fpu_rrr (FPUOp2 Reg Reg ScalarSize) Reg)
(rule (fpu_rrr op src1 src2 size)
@@ -1644,6 +1651,12 @@
(MInst.FpuCSel64 dst if_true if_false cond)
dst)))
;; Emit an `MInst.FpuRound` instruction and return the register that receives
;; its result.
;;
;; Arguments, in order: the `FpuRoundMode` to apply and the source register.
;; No separate size operand is taken; callers pick a width-specific mode
;; variant instead.
(decl fpu_round (FpuRoundMode Reg) Reg)
(rule (fpu_round mode src)
      ;; The destination is a fresh temporary, always requested as `$F64`.
      (let ((rd WritableReg (temp_writable_reg $F64))
            (_ Unit (emit (MInst.FpuRound mode rd src))))
        rd))
;; Helper for emitting `MInst.MovToFpu` instructions.
(decl mov_to_fpu (Reg ScalarSize) Reg)

View File

@@ -4426,6 +4426,17 @@ fn test_aarch64_binemit() {
"abs v1.2d, v10.2d",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fabs,
rd: writable_vreg(15),
rn: vreg(16),
size: VectorSize::Size32x2,
},
"0FFAA00E",
"fabs v15.2s, v16.2s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fabs,
@@ -4448,6 +4459,17 @@ fn test_aarch64_binemit() {
"fabs v3.2d, v22.2d",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fneg,
rd: writable_vreg(31),
rn: vreg(0),
size: VectorSize::Size32x2,
},
"1FF8A02E",
"fneg v31.2s, v0.2s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fneg,
@@ -4481,6 +4503,17 @@ fn test_aarch64_binemit() {
"fsqrt v18.2s, v25.2s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fsqrt,
rd: writable_vreg(18),
rn: vreg(25),
size: VectorSize::Size32x4,
},
"32FBA16E",
"fsqrt v18.4s, v25.4s",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fsqrt,

View File

@@ -230,6 +230,85 @@
(with_flags (fpu_cmp (scalar_size ty) rn rm)
(fpu_csel ty (Cond.Gt) rn rm)))
;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector result type: one `VecMisc` instruction with the `Fsqrt` op; the lane
;; arrangement is derived from the result type.
(rule (lower (has_type vty @ (multi_lane _ _) (sqrt src)))
      (vec_misc (VecMisc2.Fsqrt) src (vector_size vty)))

;; Scalar floating-point result type: `FpuRR` with the `Sqrt` op, sized from
;; the result type.
(rule (lower (has_type (ty_scalar_float fty) (sqrt src)))
      (fpu_rr (FPUOp1.Sqrt) src (scalar_size fty)))
;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector result type: one `VecMisc` instruction with the `Fneg` op; the lane
;; arrangement is derived from the result type.
(rule (lower (has_type vty @ (multi_lane _ _) (fneg src)))
      (vec_misc (VecMisc2.Fneg) src (vector_size vty)))

;; Scalar floating-point result type: `FpuRR` with the `Neg` op, sized from the
;; result type.
(rule (lower (has_type (ty_scalar_float fty) (fneg src)))
      (fpu_rr (FPUOp1.Neg) src (scalar_size fty)))
;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector result type: one `VecMisc` instruction with the `Fabs` op; the lane
;; arrangement is derived from the result type.
(rule (lower (has_type vty @ (multi_lane _ _) (fabs src)))
      (vec_misc (VecMisc2.Fabs) src (vector_size vty)))

;; Scalar floating-point result type: `FpuRR` with the `Abs` op, sized from the
;; result type.
(rule (lower (has_type (ty_scalar_float fty) (fabs src)))
      (fpu_rr (FPUOp1.Abs) src (scalar_size fty)))
;;;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Only an `f64` result is matched. The size handed to `fpu_rr` is `Size32`,
;; i.e. it is not derived from the `f64` result type.
;; NOTE(review): presumably this is the width of the value being promoted --
;; confirm against the `FpuRR` emission code.
(rule (lower (has_type $F64 (fpromote src)))
      (fpu_rr (FPUOp1.Cvt32To64) src (ScalarSize.Size32)))
;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Only an `f32` result is matched. The size handed to `fpu_rr` is `Size64`,
;; i.e. it is not derived from the `f32` result type.
;; NOTE(review): presumably this is the width of the value being demoted --
;; confirm against the `FpuRR` emission code.
(rule (lower (has_type $F32 (fdemote src)))
      (fpu_rr (FPUOp1.Cvt64To32) src (ScalarSize.Size64)))
;;;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector result type: `VecMisc` with the `Frintp` op; the lane arrangement is
;; derived from the result type.
(rule (lower (has_type vty @ (multi_lane _ _) (ceil src)))
      (vec_misc (VecMisc2.Frintp) src (vector_size vty)))

;; Scalar result types: `FpuRound` takes no size operand, so each float width
;; gets its own rule selecting the matching `Plus*` mode variant.
(rule (lower (has_type $F32 (ceil src)))
      (fpu_round (FpuRoundMode.Plus32) src))

(rule (lower (has_type $F64 (ceil src)))
      (fpu_round (FpuRoundMode.Plus64) src))
;;;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector result type: `VecMisc` with the `Frintm` op; the lane arrangement is
;; derived from the result type.
(rule (lower (has_type vty @ (multi_lane _ _) (floor src)))
      (vec_misc (VecMisc2.Frintm) src (vector_size vty)))

;; Scalar result types: `FpuRound` takes no size operand, so each float width
;; gets its own rule selecting the matching `Minus*` mode variant.
(rule (lower (has_type $F32 (floor src)))
      (fpu_round (FpuRoundMode.Minus32) src))

(rule (lower (has_type $F64 (floor src)))
      (fpu_round (FpuRoundMode.Minus64) src))
;;;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector result type: `VecMisc` with the `Frintz` op; the lane arrangement is
;; derived from the result type.
(rule (lower (has_type vty @ (multi_lane _ _) (trunc src)))
      (vec_misc (VecMisc2.Frintz) src (vector_size vty)))

;; Scalar result types: `FpuRound` takes no size operand, so each float width
;; gets its own rule selecting the matching `Zero*` mode variant.
(rule (lower (has_type $F32 (trunc src)))
      (fpu_round (FpuRoundMode.Zero32) src))

(rule (lower (has_type $F64 (trunc src)))
      (fpu_round (FpuRoundMode.Zero64) src))
;;;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector result type: `VecMisc` with the `Frintn` op; the lane arrangement is
;; derived from the result type.
(rule (lower (has_type vty @ (multi_lane _ _) (nearest src)))
      (vec_misc (VecMisc2.Frintn) src (vector_size vty)))

;; Scalar result types: `FpuRound` takes no size operand, so each float width
;; gets its own rule selecting the matching `Nearest*` mode variant.
(rule (lower (has_type $F32 (nearest src)))
      (fpu_round (FpuRoundMode.Nearest32) src))

(rule (lower (has_type $F64 (nearest src)))
      (fpu_round (FpuRoundMode.Nearest64) src))
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `i64` and smaller

View File

@@ -1250,107 +1250,10 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::FminPseudo | Opcode::FmaxPseudo => implemented_in_isle(ctx),
Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => {
let ty = ty.unwrap();
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if !ty.is_vector() {
let fpu_op = match op {
Opcode::Sqrt => FPUOp1::Sqrt,
Opcode::Fneg => FPUOp1::Neg,
Opcode::Fabs => FPUOp1::Abs,
Opcode::Fpromote => {
if ty != F64 {
return Err(CodegenError::Unsupported(format!(
"Fpromote: Unsupported type: {:?}",
ty
)));
}
FPUOp1::Cvt32To64
}
Opcode::Fdemote => {
if ty != F32 {
return Err(CodegenError::Unsupported(format!(
"Fdemote: Unsupported type: {:?}",
ty
)));
}
FPUOp1::Cvt64To32
}
_ => unreachable!(),
};
ctx.emit(Inst::FpuRR {
fpu_op,
size: ScalarSize::from_ty(ctx.input_ty(insn, 0)),
rd,
rn,
});
} else {
let op = match op {
Opcode::Fabs => VecMisc2::Fabs,
Opcode::Fneg => VecMisc2::Fneg,
Opcode::Sqrt => VecMisc2::Fsqrt,
_ => {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported type: {:?}",
op, ty
)))
}
};
ctx.emit(Inst::VecMisc {
op,
rd,
rn,
size: VectorSize::from_ty(ty),
});
}
implemented_in_isle(ctx)
}
Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => {
let ty = ctx.output_ty(insn, 0);
if !ty.is_vector() {
let bits = ty_bits(ty);
let op = match (op, bits) {
(Opcode::Ceil, 32) => FpuRoundMode::Plus32,
(Opcode::Ceil, 64) => FpuRoundMode::Plus64,
(Opcode::Floor, 32) => FpuRoundMode::Minus32,
(Opcode::Floor, 64) => FpuRoundMode::Minus64,
(Opcode::Trunc, 32) => FpuRoundMode::Zero32,
(Opcode::Trunc, 64) => FpuRoundMode::Zero64,
(Opcode::Nearest, 32) => FpuRoundMode::Nearest32,
(Opcode::Nearest, 64) => FpuRoundMode::Nearest64,
_ => {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported type: {:?}",
op, ty
)))
}
};
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::FpuRound { op, rd, rn });
} else {
let (op, size) = match (op, ty) {
(Opcode::Ceil, F32X4) => (VecMisc2::Frintp, VectorSize::Size32x4),
(Opcode::Ceil, F64X2) => (VecMisc2::Frintp, VectorSize::Size64x2),
(Opcode::Floor, F32X4) => (VecMisc2::Frintm, VectorSize::Size32x4),
(Opcode::Floor, F64X2) => (VecMisc2::Frintm, VectorSize::Size64x2),
(Opcode::Trunc, F32X4) => (VecMisc2::Frintz, VectorSize::Size32x4),
(Opcode::Trunc, F64X2) => (VecMisc2::Frintz, VectorSize::Size64x2),
(Opcode::Nearest, F32X4) => (VecMisc2::Frintn, VectorSize::Size32x4),
(Opcode::Nearest, F64X2) => (VecMisc2::Frintn, VectorSize::Size64x2),
_ => {
return Err(CodegenError::Unsupported(format!(
"{}: Unsupported type: {:?}",
op, ty
)))
}
};
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
ctx.emit(Inst::VecMisc { op, rd, rn, size });
}
}
Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => implemented_in_isle(ctx),
Opcode::Fma => {
let ty = ty.unwrap();

View File

@@ -701,3 +701,212 @@ block0(v0: f64):
; fcvtzs x0, d7
; ret
function %f57(f32x2) -> f32x2 {
block0(v0: f32x2):
v1 = sqrt v0
return v1
}
; block0:
; fsqrt v0.2s, v0.2s
; ret
function %f58(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = sqrt v0
return v1
}
; block0:
; fsqrt v0.4s, v0.4s
; ret
function %f59(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = sqrt v0
return v1
}
; block0:
; fsqrt v0.2d, v0.2d
; ret
function %f60(f32x2) -> f32x2 {
block0(v0: f32x2):
v1 = fneg v0
return v1
}
; block0:
; fneg v0.2s, v0.2s
; ret
function %f61(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = fneg v0
return v1
}
; block0:
; fneg v0.4s, v0.4s
; ret
function %f62(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = fneg v0
return v1
}
; block0:
; fneg v0.2d, v0.2d
; ret
function %f63(f32x2) -> f32x2 {
block0(v0: f32x2):
v1 = fabs v0
return v1
}
; block0:
; fabs v0.2s, v0.2s
; ret
function %f64(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = fabs v0
return v1
}
; block0:
; fabs v0.4s, v0.4s
; ret
function %f65(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = fabs v0
return v1
}
; block0:
; fabs v0.2d, v0.2d
; ret
function %f66(f32x2) -> f32x2 {
block0(v0: f32x2):
v1 = ceil v0
return v1
}
; block0:
; frintp v0.2s, v0.2s
; ret
function %f67(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = ceil v0
return v1
}
; block0:
; frintp v0.4s, v0.4s
; ret
function %f68(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = ceil v0
return v1
}
; block0:
; frintp v0.2d, v0.2d
; ret
function %f69(f32x2) -> f32x2 {
block0(v0: f32x2):
v1 = floor v0
return v1
}
; block0:
; frintm v0.2s, v0.2s
; ret
function %f70(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = floor v0
return v1
}
; block0:
; frintm v0.4s, v0.4s
; ret
function %f71(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = floor v0
return v1
}
; block0:
; frintm v0.2d, v0.2d
; ret
function %f72(f32x2) -> f32x2 {
block0(v0: f32x2):
v1 = trunc v0
return v1
}
; block0:
; frintz v0.2s, v0.2s
; ret
function %f73(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = trunc v0
return v1
}
; block0:
; frintz v0.4s, v0.4s
; ret
function %f74(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = trunc v0
return v1
}
; block0:
; frintz v0.2d, v0.2d
; ret
function %f75(f32x2) -> f32x2 {
block0(v0: f32x2):
v1 = nearest v0
return v1
}
; block0:
; frintn v0.2s, v0.2s
; ret
function %f76(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = nearest v0
return v1
}
; block0:
; frintn v0.4s, v0.4s
; ret
function %f77(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = nearest v0
return v1
}
; block0:
; frintn v0.2d, v0.2d
; ret