Convert fma, valltrue & vanytrue to ISLE (AArch64) (#4608)

* Convert `fma`, `valltrue` & `vanytrue` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `fma`
  - Introduced missing support for `fma` on vector values, as per the
    docs.
- `valltrue`
- `vanytrue`

Also fixed `fcmp` on scalar values in the interpreter, and enabled
interpreter tests in `simd-fma.clif`.

This introduces the `FMLA` machine instruction.

Copyright (c) 2022 Arm Limited

* Add comments for `Fmla` and `Bsl`

Copyright (c) 2022 Arm Limited
Author: Damian Heaton, 2022-08-05 17:47:56 +01:00; committed by GitHub
parent 1ed7b43e62, commit eb332b8369
19 changed files with 608 additions and 206 deletions


@@ -335,8 +335,10 @@
(rn Reg))
;; 3-op FPU instruction.
;; 16-bit scalars require half-precision floating-point support (FEAT_FP16).
(FpuRRRR
(fpu_op FPUOp3)
(size ScalarSize)
(rd WritableReg)
(rn Reg)
(rm Reg)
@@ -478,7 +480,7 @@
(rd WritableReg)
(rn Reg)
(idx u8)
-(size VectorSize))
+(size ScalarSize))
;; Signed move from a vector element to a GPR.
(MovFromVecSigned
@@ -1011,8 +1013,7 @@
;; A floating-point unit (FPU) operation with three args.
(type FPUOp3
(enum
-(MAdd32)
-(MAdd64)
+(MAdd)
))
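;; The operand precision is carried separately by the `size` field of
;; `MInst.FpuRRRR`.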
;; A conversion from an FP to an integer value.
@@ -1108,6 +1109,8 @@
;; Bitwise exclusive or
(Eor)
;; Bitwise select
;; This opcode should only be used with the `vec_rrr_inplace`
;; constructor.
(Bsl)
;; Unsigned maximum pairwise
(Umaxp)
@@ -1143,6 +1146,10 @@
(Fmin)
;; Floating-point multiply
(Fmul)
;; Floating-point fused multiply-add vectors
;; This opcode should only be used with the `vec_rrr_inplace`
;; constructor.
(Fmla)
;; Add pairwise
(Addp)
;; Zip vectors (primary) [meaning, high halves]
@@ -1364,6 +1371,9 @@
(decl imm12_from_negated_u64 (Imm12) u64)
(extern extractor imm12_from_negated_u64 imm12_from_negated_u64)
(decl pure lshr_from_u64 (Type u64) ShiftOpAndAmt)
(extern constructor lshr_from_u64 lshr_from_u64)
(decl pure lshl_from_imm64 (Type Imm64) ShiftOpAndAmt)
(extern constructor lshl_from_imm64 lshl_from_imm64)
@@ -1494,6 +1504,15 @@
(rule (fpu_rr op src size)
(let ((dst WritableReg (temp_writable_reg $F64))
(_ Unit (emit (MInst.FpuRR op size dst src))))
dst))
;; Helper for emitting `MInst.VecRRR` instructions which use three registers,
;; one of which is both source and output.
(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg)
(rule (vec_rrr_inplace op src1 src2 src3 size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_1 Unit (emit (MInst.FpuMove128 dst src1)))
(_2 Unit (emit (MInst.VecRRR op dst src2 src3 size))))
dst))
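;; The leading `FpuMove128` satisfies the register allocator's constraint
;; that the destination of such instructions is both a use and a def (see
;; the `reg_mod` call in `aarch64_get_operands`).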
;; Helper for emitting `MInst.FpuRRR` instructions.
@@ -1503,6 +1522,13 @@
(_ Unit (emit (MInst.FpuRRR op size dst src1 src2))))
dst))
;; Helper for emitting `MInst.FpuRRRR` instructions.
(decl fpu_rrrr (FPUOp3 ScalarSize Reg Reg Reg) Reg)
(rule (fpu_rrrr op size src1 src2 src3)
(let ((dst WritableReg (temp_writable_reg $F64))
(_ Unit (emit (MInst.FpuRRRR op size dst src1 src2 src3))))
dst))
;; Helper for emitting `MInst.FpuCmp` instructions.
(decl fpu_cmp (ScalarSize Reg Reg) ProducesFlags)
(rule (fpu_cmp size rn rm)
@@ -1544,6 +1570,15 @@
(_ Unit (emit (MInst.AluRRRShift op (operand_size ty) dst src1 src2 shift))))
dst))
;; Helper for emitting `cmp` instructions, setting flags, with a right-shifted
;; second operand register.
(decl cmp_rr_shift (OperandSize Reg Reg u64) ProducesFlags)
(rule (cmp_rr_shift size src1 src2 shift_amount)
(if-let shift (lshr_from_u64 $I64 shift_amount))
(ProducesFlags.ProducesFlagsSideEffect
(MInst.AluRRRShift (ALUOp.SubS) size (writable_zero_reg)
src1 src2 shift)))
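;; Used by the `vall_true` lowering to test the upper 32-bit lane of a
;; 64-bit vector held in a GPR.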
;; Helper for emitting `MInst.AluRRRExtend` instructions.
(decl alu_rrr_extend (ALUOp Type Reg Reg ExtendOp) Reg)
(rule (alu_rrr_extend op ty src1 src2 extend)
@@ -1764,7 +1799,7 @@
dst))
;; Helper for emitting `MInst.MovFromVec` instructions.
(decl mov_from_vec (Reg u8 VectorSize) Reg)
(decl mov_from_vec (Reg u8 ScalarSize) Reg)
(rule (mov_from_vec rn idx size)
(let ((dst WritableReg (temp_writable_reg $I64))
(_ Unit (emit (MInst.MovFromVec dst rn idx size))))
@@ -1840,6 +1875,22 @@
(MInst.CSNeg dst cond if_true if_false)
dst)))
;; Helper for generating `MInst.CCmpImm` instructions.
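;; A 1-bit result is materialized with CSET (giving 0 or 1); wider boolean
;; results use CSETM (giving 0 or all ones).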
(decl ccmp_imm (OperandSize u8 Reg UImm5 NZCV Cond) ConsumesFlags)
(rule (ccmp_imm size 1 rn imm nzcv cond)
(let ((dst WritableReg (temp_writable_reg $I64)))
(ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs
(MInst.CCmpImm size rn imm nzcv cond)
(MInst.CSet dst cond)
(value_reg dst))))
(rule (ccmp_imm size _ty_bits rn imm nzcv cond)
(let ((dst WritableReg (temp_writable_reg $I64)))
(ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs
(MInst.CCmpImm size rn imm nzcv cond)
(MInst.CSetm dst cond)
(value_reg dst))))
;; Helpers for generating `add` instructions.
(decl add (Type Reg Reg) Reg)


@@ -620,7 +620,7 @@ impl ScalarSize {
/// Convert to an integer operand size.
pub fn operand_size(&self) -> OperandSize {
match self {
-ScalarSize::Size32 => OperandSize::Size32,
+ScalarSize::Size8 | ScalarSize::Size16 | ScalarSize::Size32 => OperandSize::Size32,
ScalarSize::Size64 => OperandSize::Size64,
_ => panic!("Unexpected operand_size request for: {:?}", self),
}
@@ -687,8 +687,11 @@ impl VectorSize {
debug_assert!(ty.is_vector());
match ty {
B8X8 => VectorSize::Size8x8,
B8X16 => VectorSize::Size8x16,
B16X4 => VectorSize::Size16x4,
B16X8 => VectorSize::Size16x8,
B32X2 => VectorSize::Size32x2,
B32X4 => VectorSize::Size32x4,
B64X2 => VectorSize::Size64x2,
F32X2 => VectorSize::Size32x2,


@@ -1790,6 +1790,7 @@ impl MachInstEmit for Inst {
}
&Inst::FpuRRRR {
fpu_op,
size,
rd,
rn,
rm,
@@ -1800,9 +1801,9 @@ impl MachInstEmit for Inst {
let rm = allocs.next(rm);
let ra = allocs.next(ra);
let top17 = match fpu_op {
-FPUOp3::MAdd32 => 0b000_11111_00_0_00000_0,
-FPUOp3::MAdd64 => 0b000_11111_01_0_00000_0,
+FPUOp3::MAdd => 0b000_11111_00_0_00000_0,
};
+let top17 = top17 | size.ftype() << 7;
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
}
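// A sketch of the `ScalarSize::ftype` helper assumed by the emission code
// above (the accessor is added elsewhere in this commit): it returns the
// two-bit "ftype" field of scalar FP instructions, which the
// `size.ftype() << 7` above places at bits 23:22 of the encoding
// (0b00 = single, 0b01 = double, 0b11 = half precision).
//
//     pub fn ftype(&self) -> u32 {
//         match self {
//             ScalarSize::Size16 => 0b11,
//             ScalarSize::Size32 => 0b00,
//             ScalarSize::Size64 => 0b01,
//             _ => panic!("Unexpected scalar FP operand size: {:?}", self),
//         }
//     }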
&Inst::VecMisc { op, rd, rn, size } => {
@@ -2209,11 +2210,11 @@ impl MachInstEmit for Inst {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
let (q, imm5, shift, mask) = match size {
-VectorSize::Size8x16 => (0b0, 0b00001, 1, 0b1111),
-VectorSize::Size16x8 => (0b0, 0b00010, 2, 0b0111),
-VectorSize::Size32x4 => (0b0, 0b00100, 3, 0b0011),
-VectorSize::Size64x2 => (0b1, 0b01000, 4, 0b0001),
-_ => unreachable!(),
+ScalarSize::Size8 => (0b0, 0b00001, 1, 0b1111),
+ScalarSize::Size16 => (0b0, 0b00010, 2, 0b0111),
+ScalarSize::Size32 => (0b0, 0b00100, 3, 0b0011),
+ScalarSize::Size64 => (0b1, 0b01000, 4, 0b0001),
+_ => panic!("Unexpected scalar FP operand size: {:?}", size),
};
debug_assert_eq!(idx & mask, idx);
let imm5 = imm5 | ((idx as u32) << shift);
@@ -2542,7 +2543,8 @@ impl MachInstEmit for Inst {
| VecALUOp::Fdiv
| VecALUOp::Fmax
| VecALUOp::Fmin
-| VecALUOp::Fmul => true,
+| VecALUOp::Fmul
+| VecALUOp::Fmla => true,
_ => false,
};
let enc_float_size = match (is_float, size) {
@@ -2617,6 +2619,7 @@ impl MachInstEmit for Inst {
VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
VecALUOp::Fmla => (0b000_01110_00_1, 0b110011),
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
VecALUOp::Sqrdmulh => {


@@ -2266,7 +2266,7 @@ fn test_aarch64_binemit() {
rd: writable_xreg(3),
rn: vreg(27),
idx: 14,
-size: VectorSize::Size8x16,
+size: ScalarSize::Size8,
},
"633F1D0E",
"umov w3, v27.b[14]",
@@ -2276,7 +2276,7 @@ fn test_aarch64_binemit() {
rd: writable_xreg(24),
rn: vreg(5),
idx: 3,
-size: VectorSize::Size16x8,
+size: ScalarSize::Size16,
},
"B83C0E0E",
"umov w24, v5.h[3]",
@@ -2286,7 +2286,7 @@ fn test_aarch64_binemit() {
rd: writable_xreg(12),
rn: vreg(17),
idx: 1,
-size: VectorSize::Size32x4,
+size: ScalarSize::Size32,
},
"2C3E0C0E",
"mov w12, v17.s[1]",
@@ -2296,7 +2296,7 @@ fn test_aarch64_binemit() {
rd: writable_xreg(21),
rn: vreg(20),
idx: 0,
-size: VectorSize::Size64x2,
+size: ScalarSize::Size64,
},
"953E084E",
"mov x21, v20.d[0]",
@@ -4054,6 +4054,42 @@ fn test_aarch64_binemit() {
"fmul v2.2d, v0.2d, v5.2d",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Fmla,
rd: writable_vreg(2),
rn: vreg(0),
rm: vreg(5),
size: VectorSize::Size32x2,
},
"02CC250E",
"fmla v2.2s, v0.2s, v5.2s",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Fmla,
rd: writable_vreg(2),
rn: vreg(0),
rm: vreg(5),
size: VectorSize::Size32x4,
},
"02CC254E",
"fmla v2.4s, v0.4s, v5.4s",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Fmla,
rd: writable_vreg(2),
rn: vreg(0),
rm: vreg(5),
size: VectorSize::Size64x2,
},
"02CC654E",
"fmla v2.2d, v0.2d, v5.2d",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Addp,
@@ -5911,7 +5947,8 @@ fn test_aarch64_binemit() {
insns.push((
Inst::FpuRRRR {
-fpu_op: FPUOp3::MAdd32,
+fpu_op: FPUOp3::MAdd,
+size: ScalarSize::Size32,
rd: writable_vreg(15),
rn: vreg(30),
rm: vreg(31),
@@ -5923,7 +5960,8 @@ fn test_aarch64_binemit() {
insns.push((
Inst::FpuRRRR {
-fpu_op: FPUOp3::MAdd64,
+fpu_op: FPUOp3::MAdd,
+size: ScalarSize::Size64,
rd: writable_vreg(15),
rn: vreg(30),
rm: vreg(31),


@@ -292,14 +292,6 @@ impl Imm12 {
}
}
-/// Create a zero immediate of this format.
-pub fn zero() -> Self {
-Imm12 {
-bits: 0,
-shift12: false,
-}
-}
/// Bits for 2-bit "shift" field in e.g. AddI.
pub fn shift_bits(&self) -> u32 {
if self.shift12 {


@@ -960,7 +960,7 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
&Inst::VecRRR {
alu_op, rd, rn, rm, ..
} => {
-if alu_op == VecALUOp::Bsl {
+if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla {
collector.reg_mod(rd);
} else {
collector.reg_def(rd);
@@ -1705,7 +1705,7 @@ impl Inst {
}
&Inst::FpuMoveFromVec { rd, rn, idx, size } => {
let rd = pretty_print_vreg_scalar(rd.to_reg(), size.lane_size(), allocs);
-let rn = pretty_print_vreg_element(rn, idx as usize, size, allocs);
+let rn = pretty_print_vreg_element(rn, idx as usize, size.lane_size(), allocs);
format!("mov {}, {}", rd, rn)
}
&Inst::FpuExtend { rd, rn, size } => {
@@ -1777,14 +1777,14 @@ impl Inst {
}
&Inst::FpuRRRR {
fpu_op,
size,
rd,
rn,
rm,
ra,
} => {
-let (op, size) = match fpu_op {
-FPUOp3::MAdd32 => ("fmadd", ScalarSize::Size32),
-FPUOp3::MAdd64 => ("fmadd", ScalarSize::Size64),
+let op = match fpu_op {
+FPUOp3::MAdd => "fmadd",
};
let rd = pretty_print_vreg_scalar(rd.to_reg(), size, allocs);
let rn = pretty_print_vreg_scalar(rn, size, allocs);
@@ -1965,16 +1965,17 @@ impl Inst {
format!("fmov {}, {}", rd, imm)
}
&Inst::MovToVec { rd, rn, idx, size } => {
-let rd = pretty_print_vreg_element(rd.to_reg(), idx as usize, size, allocs);
+let rd =
+pretty_print_vreg_element(rd.to_reg(), idx as usize, size.lane_size(), allocs);
let rn = pretty_print_ireg(rn, size.operand_size(), allocs);
format!("mov {}, {}", rd, rn)
}
&Inst::MovFromVec { rd, rn, idx, size } => {
let op = match size {
VectorSize::Size8x16 => "umov",
VectorSize::Size16x8 => "umov",
VectorSize::Size32x4 => "mov",
VectorSize::Size64x2 => "mov",
ScalarSize::Size8 => "umov",
ScalarSize::Size16 => "umov",
ScalarSize::Size32 => "mov",
ScalarSize::Size64 => "mov",
_ => unimplemented!(),
};
let rd = pretty_print_ireg(rd.to_reg(), size.operand_size(), allocs);
@@ -1989,7 +1990,7 @@ impl Inst {
scalar_size,
} => {
let rd = pretty_print_ireg(rd.to_reg(), scalar_size, allocs);
-let rn = pretty_print_vreg_element(rn, idx as usize, size, allocs);
+let rn = pretty_print_vreg_element(rn, idx as usize, size.lane_size(), allocs);
format!("smov {}, {}", rd, rn)
}
&Inst::VecDup { rd, rn, size } => {
@@ -1999,7 +2000,7 @@ impl Inst {
}
&Inst::VecDupFromFpu { rd, rn, size } => {
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
-let rn = pretty_print_vreg_element(rn, 0, size, allocs);
+let rn = pretty_print_vreg_element(rn, 0, size.lane_size(), allocs);
format!("dup {}, {}", rd, rn)
}
&Inst::VecDupFPImm { rd, imm, size } => {
@@ -2075,8 +2076,13 @@ impl Inst {
src_idx,
size,
} => {
-let rd = pretty_print_vreg_element(rd.to_reg(), dest_idx as usize, size, allocs);
-let rn = pretty_print_vreg_element(rn, src_idx as usize, size, allocs);
+let rd = pretty_print_vreg_element(
+rd.to_reg(),
+dest_idx as usize,
+size.lane_size(),
+allocs,
+);
+let rn = pretty_print_vreg_element(rn, src_idx as usize, size.lane_size(), allocs);
format!("mov {}, {}", rd, rn)
}
&Inst::VecRRLong {
@@ -2220,6 +2226,7 @@ impl Inst {
VecALUOp::Fmax => ("fmax", size),
VecALUOp::Fmin => ("fmin", size),
VecALUOp::Fmul => ("fmul", size),
VecALUOp::Fmla => ("fmla", size),
VecALUOp::Addp => ("addp", size),
VecALUOp::Zip1 => ("zip1", size),
VecALUOp::Sqrdmulh => ("sqrdmulh", size),


@@ -331,14 +331,15 @@ pub fn show_vreg_vector(reg: Reg, size: VectorSize) -> String {
}
/// Show an indexed vector element.
-pub fn show_vreg_element(reg: Reg, idx: u8, size: VectorSize) -> String {
+pub fn show_vreg_element(reg: Reg, idx: u8, size: ScalarSize) -> String {
assert_eq!(RegClass::Float, reg.class());
let s = show_reg(reg);
let suffix = match size {
VectorSize::Size8x8 | VectorSize::Size8x16 => ".b",
VectorSize::Size16x4 | VectorSize::Size16x8 => ".h",
VectorSize::Size32x2 | VectorSize::Size32x4 => ".s",
VectorSize::Size64x2 => ".d",
ScalarSize::Size8 => ".b",
ScalarSize::Size16 => ".h",
ScalarSize::Size32 => ".s",
ScalarSize::Size64 => ".d",
_ => panic!("Unexpected vector element size: {:?}", size),
};
format!("{}{}[{}]", s, suffix, idx)
}
@@ -373,7 +374,7 @@ pub fn pretty_print_vreg_vector(
pub fn pretty_print_vreg_element(
reg: Reg,
idx: usize,
-size: VectorSize,
+size: ScalarSize,
allocs: &mut AllocationConsumer<'_>,
) -> String {
let reg = allocs.next(reg);


@@ -138,6 +138,69 @@
(rule (lower (scalar_to_vector x @ (value_type (int_bool_fits_in_32 _))))
(mov_to_fpu (put_in_reg_zext32 x) (ScalarSize.Size32)))
;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; cmeq vtmp.2d, vm.2d, #0
;; addp dtmp, vtmp.2d
;; fcmp dtmp, dtmp
;; cset xd, eq
;;
;; Note that after the ADDP the value of the temporary register will be either
;; 0 when all input elements are true, i.e. non-zero, or a NaN otherwise
;; (either -1 or -2 when represented as an integer); NaNs are the only
;; floating-point numbers that compare unequal to themselves.
(rule (lower (has_type out_ty (vall_true x @ (value_type (multi_lane 64 2)))))
(let ((x1 Reg (cmeq0 x (VectorSize.Size64x2)))
(x2 Reg (addp x1 x1 (VectorSize.Size64x2))))
(with_flags (fpu_cmp (ScalarSize.Size64) x2 x2)
(materialize_bool_result (ty_bits out_ty) (Cond.Eq)))))
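;; For 64-bit vectors with two 32-bit lanes, the whole vector fits in a
;; GPR, so both lanes can be checked with a compare / conditional-compare
;; pair. A sketch of the resulting sequence (the NZCV immediate #4 sets
;; only the Z flag):
;;
;; mov xm, vn.d[0]
;; cmp xzr, xm, lsr #32
;; ccmp wm, #0, #4, ne
;; cset xd, ne
;;
;; The CCMP forces its flags to "equal" whenever the upper lane is zero, so
;; the final NE test passes only if both lanes are non-zero.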
(rule (lower (has_type out_ty (vall_true x @ (value_type (multi_lane 32 2)))))
(let ((x1 Reg (mov_from_vec x 0 (ScalarSize.Size64))))
(with_flags (cmp_rr_shift (OperandSize.Size64) (zero_reg) x1 32)
(ccmp_imm
(OperandSize.Size32)
(ty_bits out_ty)
x1
(u8_into_uimm5 0)
(nzcv $false $true $false $false)
(Cond.Ne)))))
;; This operation is implemented by using uminv to create a scalar value, which
;; is then compared against zero.
;;
;; uminv bn, vm.16b
;; mov xm, vn.d[0]
;; cmp xm, #0
;; cset xm, ne
(rule (lower (has_type out_ty (vall_true x @ (value_type (lane_fits_in_32 ty)))))
(if (not_vec32x2 ty))
(let ((x1 Reg (vec_lanes (VecLanesOp.Uminv) x (vector_size ty)))
(x2 Reg (mov_from_vec x1 0 (ScalarSize.Size64))))
(with_flags (cmp_imm (OperandSize.Size64) x2 (u8_into_imm12 0))
(materialize_bool_result (ty_bits out_ty) (Cond.Ne)))))
;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This operation is implemented by using umaxp to create a scalar value, which
;; is then compared against zero.
;;
;; umaxp vn.4s, vm.4s, vm.4s
;; mov xm, vn.d[0]
;; cmp xm, #0
;; cset xm, ne
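;; The 32x4 arrangement is used for every 128-bit input type; only the
;; overall non-zeroness of the vector matters, so the lane layout is
;; irrelevant here.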
(rule (lower (vany_true x @ (value_type (ty_vec128 ty))))
(let ((x1 Reg (vec_rrr (VecALUOp.Umaxp) x x (VectorSize.Size32x4)))
(x2 Reg (mov_from_vec x1 0 (ScalarSize.Size64))))
(with_flags (cmp_imm (OperandSize.Size64) x2 (u8_into_imm12 0))
(materialize_bool_result (ty_bits ty) (Cond.Ne)))))
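;; 64-bit vectors fit in a single GPR, so the whole vector can be compared
;; against zero directly:
;;
;; mov xm, vn.d[0]
;; cmp xm, #0
;; cset xm, ne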
(rule (lower (vany_true x @ (value_type ty)))
(if (ty_vec64 ty))
(let ((x1 Reg (mov_from_vec x 0 (ScalarSize.Size64))))
(with_flags (cmp_imm (OperandSize.Size64) x1 (u8_into_imm12 0))
(materialize_bool_result (ty_bits ty) (Cond.Ne)))))
;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low x) (swiden_high y))))
@@ -314,6 +377,13 @@
(rule (lower (has_type $F64 (nearest x)))
(fpu_round (FpuRoundMode.Nearest64) x))
;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
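;; `fma` computes `x * y + z`. The vector form lowers to FMLA, which
;; accumulates into its destination register, so the addend `z` is moved
;; into the destination first by `vec_rrr_inplace`; the scalar form maps
;; directly onto FMADD.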
(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
(vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty)))
(rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -732,7 +802,7 @@
(rule (lower (has_type (fits_in_64 out)
(uextend (extractlane vec @ (value_type in)
(u8_from_uimm8 lane)))))
-(mov_from_vec (put_in_reg vec) lane (vector_size in)))
+(mov_from_vec (put_in_reg vec) lane (lane_size in)))
;; Atomic loads will also automatically zero their upper bits so the `uextend`
;; instruction can effectively get skipped here.
@@ -750,7 +820,7 @@
(rule (lower (has_type $I128
(uextend (extractlane vec @ (value_type in)
(u8_from_uimm8 lane)))))
-(value_regs (mov_from_vec (put_in_reg vec) lane (vector_size in)) (imm $I64 (ImmExtend.Zero) 0)))
+(value_regs (mov_from_vec (put_in_reg vec) lane (lane_size in)) (imm $I64 (ImmExtend.Zero) 0)))
;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -796,7 +866,7 @@
(u8_from_uimm8 lane)))))
(let ((lo Reg (mov_from_vec (put_in_reg vec)
lane
-(VectorSize.Size64x2)))
+(ScalarSize.Size64)))
(hi Reg (asr_imm $I64 lo (imm_shift_from_u8 63))))
(value_regs lo hi)))
@@ -1410,26 +1480,26 @@
(rule (lower (has_type $I8 (popcnt x)))
(let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32)))
(nbits Reg (vec_cnt tmp (VectorSize.Size8x8))))
-(mov_from_vec nbits 0 (VectorSize.Size8x16))))
+(mov_from_vec nbits 0 (ScalarSize.Size8))))
;; Note that this uses `addp` instead of `addv` as it's usually cheaper.
(rule (lower (has_type $I16 (popcnt x)))
(let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32)))
(nbits Reg (vec_cnt tmp (VectorSize.Size8x8)))
(added Reg (addp nbits nbits (VectorSize.Size8x8))))
-(mov_from_vec added 0 (VectorSize.Size8x16))))
+(mov_from_vec added 0 (ScalarSize.Size8))))
(rule (lower (has_type $I32 (popcnt x)))
(let ((tmp Reg (mov_to_fpu x (ScalarSize.Size32)))
(nbits Reg (vec_cnt tmp (VectorSize.Size8x8)))
(added Reg (addv nbits (VectorSize.Size8x8))))
-(mov_from_vec added 0 (VectorSize.Size8x16))))
+(mov_from_vec added 0 (ScalarSize.Size8))))
(rule (lower (has_type $I64 (popcnt x)))
(let ((tmp Reg (mov_to_fpu x (ScalarSize.Size64)))
(nbits Reg (vec_cnt tmp (VectorSize.Size8x8)))
(added Reg (addv nbits (VectorSize.Size8x8))))
-(mov_from_vec added 0 (VectorSize.Size8x16))))
+(mov_from_vec added 0 (ScalarSize.Size8))))
(rule (lower (has_type $I128 (popcnt x)))
(let ((val ValueRegs x)
@@ -1437,7 +1507,7 @@
(tmp Reg (mov_to_vec tmp_half (value_regs_get val 1) 1 (VectorSize.Size64x2)))
(nbits Reg (vec_cnt tmp (VectorSize.Size8x16)))
(added Reg (addv nbits (VectorSize.Size8x16))))
-(value_regs (mov_from_vec added 0 (VectorSize.Size8x16)) (imm $I64 (ImmExtend.Zero) 0))))
+(value_regs (mov_from_vec added 0 (ScalarSize.Size8)) (imm $I64 (ImmExtend.Zero) 0))))
(rule (lower (has_type $I8X16 (popcnt x)))
(vec_cnt x (VectorSize.Size8x16)))


@@ -106,6 +106,16 @@ where
ImmShift::maybe_from_u64(n.into()).unwrap()
}
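// Constructs a logical-shift-right operand for `cmp_rr_shift`, masking
// the shift amount to the bit width of the shifted type.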
fn lshr_from_u64(&mut self, ty: Type, n: u64) -> Option<ShiftOpAndAmt> {
let shiftimm = ShiftOpShiftImm::maybe_from_shift(n)?;
if let Ok(bits) = u8::try_from(ty_bits(ty)) {
let shiftimm = shiftimm.mask(bits);
Some(ShiftOpAndAmt::new(ShiftOp::LSR, shiftimm))
} else {
None
}
}
fn lshl_from_imm64(&mut self, ty: Type, n: Imm64) -> Option<ShiftOpAndAmt> {
let shiftimm = ShiftOpShiftImm::maybe_from_shift(n.bits() as u64)?;
let shiftee_bits = ty_bits(ty);


@@ -457,7 +457,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
(true, false) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);
+let size = ScalarSize::from_bits(oty_bits);
ctx.emit(Inst::MovFromVec {
rd,
@@ -685,7 +685,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let ty = ty.unwrap();
if ty_has_int_representation(ty) {
-ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
+ctx.emit(Inst::MovFromVec {
+rd,
+rn,
+idx,
+size: size.lane_size(),
+});
// Plain moves are faster on some processors.
} else if idx == 0 {
ctx.emit(Inst::gen_move(rd, rn, ty));
@@ -729,115 +734,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::ScalarToVector => implemented_in_isle(ctx),
-Opcode::VallTrue if ctx.input_ty(insn, 0).lane_bits() == 64 => {
-let input_ty = ctx.input_ty(insn, 0);
-if input_ty.lane_count() != 2 {
-return Err(CodegenError::Unsupported(format!(
-"VallTrue: unsupported type {:?}",
-input_ty
-)));
-}
-let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();
-// cmeq vtmp.2d, vm.2d, #0
-// addp dtmp, vtmp.2d
-// fcmp dtmp, dtmp
-// cset xd, eq
-//
-// Note that after the ADDP the value of the temporary register will
-// be either 0 when all input elements are true, i.e. non-zero, or a
-// NaN otherwise (either -1 or -2 when represented as an integer);
-// NaNs are the only floating-point numbers that compare unequal to
-// themselves.
-ctx.emit(Inst::VecMisc {
-op: VecMisc2::Cmeq0,
-rd: tmp,
-rn: rm,
-size: VectorSize::Size64x2,
-});
-ctx.emit(Inst::VecRRPair {
-op: VecPairOp::Addp,
-rd: tmp,
-rn: tmp.to_reg(),
-});
-ctx.emit(Inst::FpuCmp {
-size: ScalarSize::Size64,
-rn: tmp.to_reg(),
-rm: tmp.to_reg(),
-});
-materialize_bool_result(ctx, insn, rd, Cond::Eq);
-}
-Opcode::VanyTrue | Opcode::VallTrue => {
-let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let src_ty = ctx.input_ty(insn, 0);
-let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
-// This operation is implemented by using umaxp or uminv to
-// create a scalar value, which is then compared against zero.
-//
-// umaxp vn.16b, vm.16, vm.16 / uminv bn, vm.16b
-// mov xm, vn.d[0]
-// cmp xm, #0
-// cset xm, ne
-let s = VectorSize::from_ty(src_ty);
-let size = if s == VectorSize::Size64x2 {
-// `vall_true` with 64-bit elements is handled elsewhere.
-debug_assert_ne!(op, Opcode::VallTrue);
-VectorSize::Size32x4
-} else {
-s
-};
-if op == Opcode::VanyTrue {
-ctx.emit(Inst::VecRRR {
-alu_op: VecALUOp::Umaxp,
-rd: tmp,
-rn: rm,
-rm,
-size,
-});
-} else {
-if size == VectorSize::Size32x2 {
-return Err(CodegenError::Unsupported(format!(
-"VallTrue: Unsupported type: {:?}",
-src_ty
-)));
-}
-ctx.emit(Inst::VecLanes {
-op: VecLanesOp::Uminv,
-rd: tmp,
-rn: rm,
-size,
-});
-};
-ctx.emit(Inst::MovFromVec {
-rd,
-rn: tmp.to_reg(),
-idx: 0,
-size: VectorSize::Size64x2,
-});
-ctx.emit(Inst::AluRRImm12 {
-alu_op: ALUOp::SubS,
-size: OperandSize::Size64,
-rd: writable_zero_reg(),
-rn: rd.to_reg(),
-imm12: Imm12::zero(),
-});
-materialize_bool_result(ctx, insn, rd, Cond::Ne);
-}
+Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx),
Opcode::VhighBits => {
let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
@@ -904,7 +801,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
-size: VectorSize::Size16x8,
+size: ScalarSize::Size16,
});
}
I16X8 => {
@@ -962,7 +859,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
-size: VectorSize::Size16x8,
+size: ScalarSize::Size16,
});
}
I32X4 => {
@@ -1018,7 +915,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
-size: VectorSize::Size32x4,
+size: ScalarSize::Size32,
});
}
I64X2 => {
@@ -1031,13 +928,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: src_v,
idx: 0,
-size: VectorSize::Size64x2,
+size: ScalarSize::Size64,
});
ctx.emit(Inst::MovFromVec {
rd: tmp_r0,
rn: src_v,
idx: 1,
-size: VectorSize::Size64x2,
+size: ScalarSize::Size64,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr,
@@ -1139,31 +1036,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => implemented_in_isle(ctx),
-Opcode::Fma => {
-let ty = ty.unwrap();
-let bits = ty_bits(ty);
-let fpu_op = match bits {
-32 => FPUOp3::MAdd32,
-64 => FPUOp3::MAdd64,
-_ => {
-return Err(CodegenError::Unsupported(format!(
-"Fma: Unsupported type: {:?}",
-ty
-)))
-}
-};
-let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
-let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-ctx.emit(Inst::FpuRRRR {
-fpu_op,
-rn,
-rm,
-ra,
-rd,
-});
-}
+Opcode::Fma => implemented_in_isle(ctx),
Opcode::Fcopysign => {
// Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence: