Merge pull request #1966 from jgouly/simd-arith

arm64: Implement basic SIMD arithmetic
Chris Fallin · 2020-07-02 11:07:14 -07:00 · committed by GitHub
5 changed files with 277 additions and 30 deletions

View File

@@ -186,8 +186,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("simd", "simd_boolean") => return false, ("simd", "simd_boolean") => return false,
("simd", "simd_f32x4_cmp") => return false, ("simd", "simd_f32x4_cmp") => return false,
("simd", "simd_f64x2_cmp") => return false, ("simd", "simd_f64x2_cmp") => return false,
("simd", "simd_i8x16_arith") => return false,
("simd", "simd_i8x16_cmp") => return false, ("simd", "simd_i8x16_cmp") => return false,
("simd", "simd_i16x8_arith") => return false,
("simd", "simd_i16x8_cmp") => return false, ("simd", "simd_i16x8_cmp") => return false,
("simd", "simd_i32x4_arith") => return false,
("simd", "simd_i32x4_cmp") => return false, ("simd", "simd_i32x4_cmp") => return false,
("simd", "simd_load_extend") => return false, ("simd", "simd_load_extend") => return false,
("simd", "simd_load_splat") => return false, ("simd", "simd_load_splat") => return false,

View File

@@ -355,10 +355,11 @@ fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
 }
 
-fn enc_vec_rr_misc(bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+fn enc_vec_rr_misc(size: u32, bits_12_16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert_eq!(size & 0b11, size);
     debug_assert_eq!(bits_12_16 & 0b11111, bits_12_16);
     let bits = 0b0_1_1_01110_00_10000_00000_10_00000_00000;
-    bits | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
+    bits | size << 22 | bits_12_16 << 12 | machreg_to_vec(rn) << 5 | machreg_to_vec(rd.to_reg())
 }
 
 fn enc_vec_lanes(q: u32, u: u32, size: u32, opcode: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
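
The two-register miscellaneous encoder now takes a `size` argument, placed at bits 23:22 of the instruction word (`size << 22`); the new assertion pins it to two bits. A worked example follows the `VecMisc` emission change below.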
@@ -1067,13 +1068,24 @@ impl MachInstEmit for Inst {
             sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
         }
         &Inst::VecMisc { op, rd, rn, ty } => {
-            let bits_12_16 = match op {
+            let enc_size = match ty {
+                I8X16 => 0b00,
+                I16X8 => 0b01,
+                I32X4 => 0b10,
+                I64X2 => 0b11,
+                _ => 0,
+            };
+            let (bits_12_16, size) = match op {
                 VecMisc2::Not => {
                     debug_assert_eq!(128, ty_bits(ty));
-                    0b00101
+                    (0b00101, 0b00)
+                }
+                VecMisc2::Neg => {
+                    debug_assert_eq!(128, ty_bits(ty));
+                    (0b01011, enc_size)
                 }
             };
-            sink.put4(enc_vec_rr_misc(bits_12_16, rd, rn));
+            sink.put4(enc_vec_rr_misc(size, bits_12_16, rd, rn));
         }
         &Inst::VecLanes { op, rd, rn, ty } => {
             let (q, size) = match ty {
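
As a sanity check on the emission change above, here is a minimal standalone sketch (not the crate's code; plain register numbers stand in for the `Reg` wrappers) that re-derives the encoding of `neg v8.16b, v12.16b` and matches the "88B9206E" test string added further down:

    fn enc_vec_rr_misc(size: u32, bits_12_16: u32, rd: u32, rn: u32) -> u32 {
        // Base pattern of the two-register miscellaneous class: Q=1 (bit 30),
        // U=1 (bit 29); `size` fills bits 23:22 and the opcode bits 16:12.
        let bits = 0b0_1_1_01110_00_10000_00000_10_00000_00000;
        bits | size << 22 | bits_12_16 << 12 | rn << 5 | rd
    }

    fn main() {
        // NEG is opcode 0b01011; size 0b00 selects byte lanes (.16b).
        let word = enc_vec_rr_misc(0b00, 0b01011, 8, 12);
        assert_eq!(word, 0x6E20_B988);
        // The emit tests spell each word as its four bytes in little-endian
        // order, which is where "88B9206E" comes from.
        assert_eq!(word.to_le_bytes(), [0x88, 0xB9, 0x20, 0x6E]);
    }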
@@ -1277,6 +1289,7 @@ impl MachInstEmit for Inst {
                     I8X16 => 0b00,
                     I16X8 => 0b01,
                     I32X4 => 0b10,
+                    I64X2 => 0b11,
                     _ => 0,
                 };
                 let enc_size_for_fcmp = match ty {
@@ -1333,6 +1346,12 @@ impl MachInstEmit for Inst {
                         (0b011_01110_01_1, 0b000111)
                     }
                     VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
+                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Mul => {
+                        debug_assert_ne!(I64X2, ty);
+                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
+                    }
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
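
Add, sub, and mul all live in the AdvSIMD "three same" class; the third bit of `top11` is the U bit, so add and sub share their opcode bits and differ only there, while `enc_size << 1` puts the lane size at bits 23:22. Mul has no 64-bit-lane form, hence the `debug_assert_ne!(I64X2, ty)`. A minimal sketch, assuming `enc_vec_rrr` packs `top11 << 21 | rm << 16 | bits15_10 << 10 | rn << 5 | rd` (as the field widths suggest), checked against the test cases added below:

    fn enc_vec_rrr(top11: u32, rm: u32, bits15_10: u32, rn: u32, rd: u32) -> u32 {
        top11 << 21 | rm << 16 | bits15_10 << 10 | rn << 5 | rd
    }

    fn main() {
        let enc_size = 0b00; // byte lanes (.16b)
        // add v5.16b, v1.16b, v1.16b -> "2584214E" in the tests below.
        let add = enc_vec_rrr(0b010_01110_00_1 | enc_size << 1, 1, 0b100001, 1, 5);
        assert_eq!(add.to_le_bytes(), [0x25, 0x84, 0x21, 0x4E]);
        // sub flips only the U bit (bit 29) relative to add.
        let sub = enc_vec_rrr(0b011_01110_00_1 | enc_size << 1, 1, 0b100001, 1, 5);
        assert_eq!(sub, add | 1 << 29);
    }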

View File

@@ -2341,6 +2341,138 @@ fn test_aarch64_binemit() {
"umaxp v1.4s, v20.4s, v16.4s", "umaxp v1.4s, v20.4s, v16.4s",
)); ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(5),
+            rn: vreg(1),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "2584214E",
+        "add v5.16b, v1.16b, v1.16b",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(7),
+            rn: vreg(13),
+            rm: vreg(2),
+            ty: I16X8,
+        },
+        "A785624E",
+        "add v7.8h, v13.8h, v2.8h",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(18),
+            rn: vreg(9),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "3285A64E",
+        "add v18.4s, v9.4s, v6.4s",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Add,
+            rd: writable_vreg(1),
+            rn: vreg(3),
+            rm: vreg(2),
+            ty: I64X2,
+        },
+        "6184E24E",
+        "add v1.2d, v3.2d, v2.2d",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(5),
+            rn: vreg(1),
+            rm: vreg(1),
+            ty: I8X16,
+        },
+        "2584216E",
+        "sub v5.16b, v1.16b, v1.16b",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(7),
+            rn: vreg(13),
+            rm: vreg(2),
+            ty: I16X8,
+        },
+        "A785626E",
+        "sub v7.8h, v13.8h, v2.8h",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(18),
+            rn: vreg(9),
+            rm: vreg(6),
+            ty: I32X4,
+        },
+        "3285A66E",
+        "sub v18.4s, v9.4s, v6.4s",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Sub,
+            rd: writable_vreg(18),
+            rn: vreg(0),
+            rm: vreg(8),
+            ty: I64X2,
+        },
+        "1284E86E",
+        "sub v18.2d, v0.2d, v8.2d",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(25),
+            rn: vreg(9),
+            rm: vreg(8),
+            ty: I8X16,
+        },
+        "399D284E",
+        "mul v25.16b, v9.16b, v8.16b",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(30),
+            rn: vreg(30),
+            rm: vreg(12),
+            ty: I16X8,
+        },
+        "DE9F6C4E",
+        "mul v30.8h, v30.8h, v12.8h",
+    ));
+    insns.push((
+        Inst::VecRRR {
+            alu_op: VecALUOp::Mul,
+            rd: writable_vreg(18),
+            rn: vreg(18),
+            rm: vreg(18),
+            ty: I32X4,
+        },
+        "529EB24E",
+        "mul v18.4s, v18.4s, v18.4s",
+    ));
     insns.push((
         Inst::VecMisc {
             op: VecMisc2::Not,
@@ -2352,6 +2484,50 @@ fn test_aarch64_binemit() {
"mvn v2.16b, v1.16b", "mvn v2.16b, v1.16b",
)); ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(8),
+            rn: vreg(12),
+            ty: I8X16,
+        },
+        "88B9206E",
+        "neg v8.16b, v12.16b",
+    ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(0),
+            rn: vreg(31),
+            ty: I16X8,
+        },
+        "E0BB606E",
+        "neg v0.8h, v31.8h",
+    ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(2),
+            rn: vreg(3),
+            ty: I32X4,
+        },
+        "62B8A06E",
+        "neg v2.4s, v3.4s",
+    ));
+    insns.push((
+        Inst::VecMisc {
+            op: VecMisc2::Neg,
+            rd: writable_vreg(10),
+            rn: vreg(8),
+            ty: I64X2,
+        },
+        "0AB9E06E",
+        "neg v10.2d, v8.2d",
+    ));
     insns.push((
         Inst::VecLanes {
             op: VecLanesOp::Uminv,

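Each tuple pairs an `Inst` with its expected machine code (the four instruction bytes, printed in little-endian order) and its expected disassembly. Together the new cases cover every legal arrangement: `.16b`, `.8h`, `.4s`, and `.2d` for add, sub, and neg, and `.16b`, `.8h`, and `.4s` for mul.
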
View File

@@ -243,13 +243,21 @@ pub enum VecALUOp {
     Bsl,
     /// Unsigned maximum pairwise
     Umaxp,
+    /// Add
+    Add,
+    /// Subtract
+    Sub,
+    /// Multiply
+    Mul,
 }
 
 /// A Vector miscellaneous operation with two registers.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub enum VecMisc2 {
-    /// Bitwise NOT.
+    /// Bitwise NOT
     Not,
+    /// Negate
+    Neg,
 }
 
 /// An operation across the lanes of vectors.
@@ -2737,6 +2745,9 @@ impl ShowWithRRU for Inst {
                 VecALUOp::Eor => ("eor", true, I8X16),
                 VecALUOp::Bsl => ("bsl", true, I8X16),
                 VecALUOp::Umaxp => ("umaxp", true, ty),
+                VecALUOp::Add => ("add", true, ty),
+                VecALUOp::Sub => ("sub", true, ty),
+                VecALUOp::Mul => ("mul", true, ty),
             };
             let show_vreg_fn: fn(Reg, Option<&RealRegUniverse>, Type) -> String = if vector {
@@ -2750,14 +2761,10 @@ impl ShowWithRRU for Inst {
             let rm = show_vreg_fn(rm, mb_rru, ty);
             format!("{} {}, {}, {}", op, rd, rn, rm)
         }
-        &Inst::VecMisc {
-            op,
-            rd,
-            rn,
-            ty: _ty,
-        } => {
+        &Inst::VecMisc { op, rd, rn, ty } => {
             let (op, ty) = match op {
                 VecMisc2::Not => ("mvn", I8X16),
+                VecMisc2::Neg => ("neg", ty),
             };
             let rd = show_vreg_vector(rd.to_reg(), mb_rru, ty);
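
Since NOT is purely bitwise, `mvn` keeps its fixed `.16b` arrangement when printed, while `neg` uses the arrangement implied by the instruction's type; this is why the `VecMisc` arm now binds `ty` instead of discarding it as `_ty`.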

View File

@@ -58,18 +58,40 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Iadd => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op: VecALUOp::Add,
+                    ty,
+                });
+            }
         }
         Opcode::Isub => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rm = put_input_in_rse_imm12(ctx, inputs[1], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
+                ctx.emit(Inst::VecRRR {
+                    rd,
+                    rn,
+                    rm,
+                    alu_op: VecALUOp::Sub,
+                    ty,
+                });
+            }
         }
         Opcode::UaddSat | Opcode::SaddSat => {
             // We use the vector instruction set's saturating adds (UQADD /
@@ -143,11 +165,21 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         Opcode::Ineg => {
             let rd = get_output_reg(ctx, outputs[0]);
-            let rn = zero_reg();
-            let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
-            ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            if ty_bits(ty) < 128 {
+                let rn = zero_reg();
+                let rm = put_input_in_rse_imm12(ctx, inputs[0], NarrowValueMode::None);
+                let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64);
+                ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm));
+            } else {
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                ctx.emit(Inst::VecMisc {
+                    op: VecMisc2::Neg,
+                    rd,
+                    rn,
+                    ty,
+                });
+            }
         }
 
         Opcode::Imul => {
@@ -155,14 +187,24 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
             let ty = ty.unwrap();
-            let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64);
-            ctx.emit(Inst::AluRRRR {
-                alu_op,
-                rd,
-                rn,
-                rm,
-                ra: zero_reg(),
-            });
+            if ty_bits(ty) < 128 {
+                let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64);
+                ctx.emit(Inst::AluRRRR {
+                    alu_op,
+                    rd,
+                    rn,
+                    rm,
+                    ra: zero_reg(),
+                });
+            } else {
+                ctx.emit(Inst::VecRRR {
+                    alu_op: VecALUOp::Mul,
+                    rd,
+                    rn,
+                    rm,
+                    ty,
+                });
+            }
         }
 
         Opcode::Umulhi | Opcode::Smulhi => {
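
All four lowerings follow the same split: types narrower than 128 bits keep the existing scalar paths, including the immediate-capable `put_input_in_rse_imm12` operand forms and, for scalar `imul`, `madd` with a zero addend, while 128-bit vector types move both operands into registers and emit `VecRRR` or `VecMisc`. A CLIF `iadd` on `i32x4`, for example, now selects `add vD.4s, vN.4s, vM.4s`. Note that `i64x2` multiplication is not covered: AdvSIMD `mul` has no 64-bit-lane form, and the emitter asserts against it.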