arm64: Implement SIMD bitwise operations

Copyright (c) 2020, Arm Limited.
Joey Gouly
2020-06-11 17:30:55 +01:00
committed by Chris Fallin
parent 2cfaae85b0
commit 544c5dece5
5 changed files with 223 additions and 60 deletions


@@ -181,6 +181,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
},
"Cranelift" => match (testsuite, testname) {
("simd", "simd_address") => return false,
("simd", "simd_bitwise") => return false,
("simd", "simd_i8x16_cmp") => return false,
("simd", "simd_i16x8_cmp") => return false,
("simd", "simd_i32x4_cmp") => return false,


@@ -1035,7 +1035,7 @@ impl MachInstEmit for Inst {
&Inst::VecMisc { op, rd, rn, ty } => {
let bits_12_16 = match op {
VecMisc2::Not => {
debug_assert_eq!(I8X16, ty);
debug_assert_eq!(128, ty_bits(ty));
0b00101
}
};
@@ -1256,6 +1256,28 @@ impl MachInstEmit for Inst {
VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001101),
VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size_for_cmp << 1, 0b001111),
// The following instructions operate on bytes, so are not encoded differently
// for the different vector types.
VecALUOp::And => {
debug_assert_eq!(128, ty_bits(ty));
(0b010_01110_00_1, 0b000111)
}
VecALUOp::Bic => {
debug_assert_eq!(128, ty_bits(ty));
(0b010_01110_01_1, 0b000111)
}
VecALUOp::Orr => {
debug_assert_eq!(128, ty_bits(ty));
(0b010_01110_10_1, 0b000111)
}
VecALUOp::Eor => {
debug_assert_eq!(128, ty_bits(ty));
(0b011_01110_00_1, 0b000111)
}
VecALUOp::Bsl => {
debug_assert_eq!(128, ty_bits(ty));
(0b011_01110_01_1, 0b000111)
}
};
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}


@@ -2191,12 +2191,72 @@ fn test_aarch64_binemit() {
"cmhs v8.4s, v2.4s, v15.4s",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::And,
rd: writable_vreg(20),
rn: vreg(19),
rm: vreg(18),
ty: I32X4,
},
"741E324E",
"and v20.16b, v19.16b, v18.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Bic,
rd: writable_vreg(8),
rn: vreg(11),
rm: vreg(1),
ty: I8X16,
},
"681D614E",
"bic v8.16b, v11.16b, v1.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Orr,
rd: writable_vreg(15),
rn: vreg(2),
rm: vreg(12),
ty: I16X8,
},
"4F1CAC4E",
"orr v15.16b, v2.16b, v12.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Eor,
rd: writable_vreg(18),
rn: vreg(3),
rm: vreg(22),
ty: I8X16,
},
"721C366E",
"eor v18.16b, v3.16b, v22.16b",
));
insns.push((
Inst::VecRRR {
alu_op: VecALUOp::Bsl,
rd: writable_vreg(8),
rn: vreg(9),
rm: vreg(1),
ty: I8X16,
},
"281D616E",
"bsl v8.16b, v9.16b, v1.16b",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Not,
rd: writable_vreg(2),
rn: vreg(1),
ty: I8X16,
ty: I32X4,
},
"2258206E",
"mvn v2.16b, v1.16b",

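The VecMisc test can be cross-checked the same way against the two-register-miscellaneous layout. The field positions below come from the Arm architecture manual rather than from the backend, so treat this as a sketch: for NOT/MVN the opcode field at bits 16..12 is the 0b00101 value selected in the emitter above, with Q = 1 and U = 1 for the 128-bit form and size fixed at 00.

fn enc_vec_misc_not(rn: u32, rd: u32) -> u32 {
    // 0 Q U 01110 size 10000 | opcode | 10 | Rn | Rd, with Q=1, U=1, size=00.
    (0b011_01110_00_10000 << 17) | (0b00101 << 12) | (0b10 << 10) | (rn << 5) | rd
}

fn main() {
    let word = enc_vec_misc_not(1, 2); // mvn v2.16b, v1.16b
    assert_eq!(word, 0x6E20_5822);
    assert_eq!(word.to_le_bytes(), [0x22, 0x58, 0x20, 0x6E]); // test string "2258206E"
}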

@@ -225,6 +225,16 @@ pub enum VecALUOp {
Cmhs,
/// Compare unsigned higher or same
Cmhi,
/// Bitwise and
And,
/// Bitwise bit clear
Bic,
/// Bitwise inclusive or
Orr,
/// Bitwise exclusive or
Eor,
/// Bitwise select
Bsl,
}
/// A Vector miscellaneous operation with two registers.
@@ -1273,8 +1283,14 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::VecRRR { rd, rn, rm, .. } => {
collector.add_def(rd);
&Inst::VecRRR {
alu_op, rd, rn, rm, ..
} => {
if alu_op == VecALUOp::Bsl {
collector.add_mod(rd);
} else {
collector.add_def(rd);
}
collector.add_use(rn);
collector.add_use(rm);
}
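Bsl is the one operation here whose destination is also a source: the incoming value of rd supplies the per-bit select mask, and the result overwrites it. That is why the collector above records rd as modified (add_mod) rather than defined, and why aarch64_map_regs mirrors the same distinction below. A minimal scalar model of the semantics (not the backend's code):

// Each result bit comes from vn where the incoming vd bit is 1 and from vm
// where it is 0, so vd is read as well as written.
fn bsl_model(vd_in: u64, vn: u64, vm: u64) -> u64 {
    (vd_in & vn) | (!vd_in & vm)
}

fn main() {
    assert_eq!(bsl_model(0xFF00, 0x1234, 0xABCD), 0x12CD);
}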
@@ -1851,12 +1867,17 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_use(mapper, rn);
}
&mut Inst::VecRRR {
alu_op,
ref mut rd,
ref mut rn,
ref mut rm,
..
} => {
map_def(mapper, rd);
if alu_op == VecALUOp::Bsl {
map_mod(mapper, rd);
} else {
map_def(mapper, rd);
}
map_use(mapper, rn);
map_use(mapper, rm);
}
@@ -2663,16 +2684,21 @@ impl ShowWithRRU for Inst {
alu_op,
ty,
} => {
let (op, vector) = match alu_op {
VecALUOp::SQAddScalar => ("sqadd", false),
VecALUOp::UQAddScalar => ("uqadd", false),
VecALUOp::SQSubScalar => ("sqsub", false),
VecALUOp::UQSubScalar => ("uqsub", false),
VecALUOp::Cmeq => ("cmeq", true),
VecALUOp::Cmge => ("cmge", true),
VecALUOp::Cmgt => ("cmgt", true),
VecALUOp::Cmhs => ("cmhs", true),
VecALUOp::Cmhi => ("cmhi", true),
let (op, vector, ty) = match alu_op {
VecALUOp::SQAddScalar => ("sqadd", false, ty),
VecALUOp::UQAddScalar => ("uqadd", false, ty),
VecALUOp::SQSubScalar => ("sqsub", false, ty),
VecALUOp::UQSubScalar => ("uqsub", false, ty),
VecALUOp::Cmeq => ("cmeq", true, ty),
VecALUOp::Cmge => ("cmge", true, ty),
VecALUOp::Cmgt => ("cmgt", true, ty),
VecALUOp::Cmhs => ("cmhs", true, ty),
VecALUOp::Cmhi => ("cmhi", true, ty),
VecALUOp::And => ("and", true, I8X16),
VecALUOp::Bic => ("bic", true, I8X16),
VecALUOp::Orr => ("orr", true, I8X16),
VecALUOp::Eor => ("eor", true, I8X16),
VecALUOp::Bsl => ("bsl", true, I8X16),
};
let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
@@ -2686,9 +2712,14 @@ impl ShowWithRRU for Inst {
let rm = show_vreg_fn(rm, mb_rru, ty);
format!("{} {}, {}, {}", op, rd, rn, rm)
}
&Inst::VecMisc { op, rd, rn, ty } => {
let op = match op {
VecMisc2::Not => "mvn",
&Inst::VecMisc {
op,
rd,
rn,
ty: _ty,
} => {
let (op, ty) = match op {
VecMisc2::Not => ("mvn", I8X16),
};
let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty);


@@ -386,11 +386,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Bnot => {
let rd = output_to_reg(ctx, outputs[0]);
let rm = input_to_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
let ty = ty.unwrap();
let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
// NOT rd, rm ==> ORR_NOT rd, zero, rm
ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
if ty_bits(ty) < 128 {
let rm = input_to_rs_immlogic(ctx, inputs[0], NarrowValueMode::None);
let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64);
// NOT rd, rm ==> ORR_NOT rd, zero, rm
ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm));
} else {
let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
ctx.emit(Inst::VecMisc {
op: VecMisc2::Not,
rd,
rn: rm,
ty,
});
}
}
Opcode::Band
@@ -400,19 +410,41 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::BorNot
| Opcode::BxorNot => {
let rd = output_to_reg(ctx, outputs[0]);
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
let ty = ty.unwrap();
let alu_op = match op {
Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
_ => unreachable!(),
};
ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
if ty_bits(ty) < 128 {
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_rs_immlogic(ctx, inputs[1], NarrowValueMode::None);
let alu_op = match op {
Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64),
Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64),
Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64),
Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64),
Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64),
Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64),
_ => unreachable!(),
};
ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm));
} else {
let alu_op = match op {
Opcode::Band => VecALUOp::And,
Opcode::BandNot => VecALUOp::Bic,
Opcode::Bor => VecALUOp::Orr,
Opcode::Bxor => VecALUOp::Eor,
_ => unreachable!(),
};
let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rd = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::VecRRR {
alu_op,
rd,
rn,
rm,
ty,
});
}
}
Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => {
@@ -1035,32 +1067,49 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
Opcode::Bitselect => {
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
let rd = output_to_reg(ctx, outputs[0]);
let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[2], NarrowValueMode::None);
// AND rTmp, rn, rcond
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::And64,
rd: tmp,
rn,
rm: rcond,
});
// BIC rd, rm, rcond
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::AndNot64,
rd,
rn: rm,
rm: rcond,
});
// ORR rd, rd, rTmp
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Orr64,
rd,
rn: rd.to_reg(),
rm: tmp.to_reg(),
});
let ty = ty.unwrap();
if ty_bits(ty) < 128 {
let tmp = ctx.alloc_tmp(RegClass::I64, I64);
let rd = output_to_reg(ctx, outputs[0]);
let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[2], NarrowValueMode::None);
// AND rTmp, rn, rcond
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::And64,
rd: tmp,
rn,
rm: rcond,
});
// BIC rd, rm, rcond
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::AndNot64,
rd,
rn: rm,
rm: rcond,
});
// ORR rd, rd, rTmp
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Orr64,
rd,
rn: rd.to_reg(),
rm: tmp.to_reg(),
});
} else {
let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None);
let rm = input_to_reg(ctx, inputs[2], NarrowValueMode::None);
let rd = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::gen_move(rd, rcond, ty));
ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Bsl,
rd,
rn,
rm,
ty,
});
}
}
Opcode::Trueif => {
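The two Bitselect paths above compute the same function: the scalar sequence builds (rn & rcond) | (rm & !rcond) with AND/BIC/ORR, while the vector path moves the mask into rd and lets BSL perform the per-bit select in one instruction. A scalar sketch of both sequences, using plain Rust operations as stand-ins for the emitted instructions:

fn bitselect_scalar(rcond: u64, rn: u64, rm: u64) -> u64 {
    let tmp = rn & rcond; // AND rTmp, rn, rcond
    let rd = rm & !rcond; // BIC rd, rm, rcond
    rd | tmp              // ORR rd, rd, rTmp
}

fn bitselect_vector(rcond: u64, rn: u64, rm: u64) -> u64 {
    let rd = rcond;         // gen_move: rd <- rcond
    (rd & rn) | (!rd & rm)  // BSL rd, rn, rm
}

fn main() {
    let (c, x, y) = (0x00FF_FF00u64, 0x1234_5678, 0x9ABC_DEF0);
    assert_eq!(bitselect_scalar(c, x, y), bitselect_vector(c, x, y));
    assert_eq!(bitselect_scalar(c, x, y), 0x9A34_56F0);
}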