CL/aarch64 back end: implement the wasm SIMD bitmask instructions
The `bitmask.{8x16,16x8,32x4}` instructions do not map neatly to any single
AArch64 SIMD instruction, and instead need a sequence of around ten
instructions. Because of this, this patch is somewhat longer and more complex
than it would be for (eg) x64.
Main changes are:
* the relevant testsuite test (`simd_boolean.wast`) has been enabled on aarch64.
* at the CLIF level, add a new instruction `vhigh_bits`, into which these wasm
instructions are to be translated.
* in the wasm->CLIF translation (code_translator.rs), translate into
`vhigh_bits`. This is straightforward.
* in the CLIF->AArch64 translation (lower_inst.rs), translate `vhigh_bits`
into equivalent sequences of AArch64 instructions. There is a different
sequence for each of the `{8x16, 16x8, 32x4}` variants.
All other changes are AArch64-specific, and add instruction definitions needed
by the previous step:
* Add two new families of AArch64 instructions: `VecShiftImm` (vector shift by
immediate) and `VecExtract` (effectively a double-length vector shift)
* To the existing AArch64 family `VecRRR`, add a `zip1` variant. To the
`VecLanesOp` family, add an `addv` variant.
* Add supporting code for the above changes to AArch64 instructions:
- getting the register uses (`aarch64_get_regs`)
- mapping the registers (`aarch64_map_regs`)
- printing instructions
- emitting instructions (`impl MachInstEmit for Inst`). The handling of
`VecShiftImm` is a bit complex.
- emission tests for new instructions and variants.
This commit is contained in:
committed by
julian-seward1
parent
b10e027fef
commit
2702942050
6
build.rs
6
build.rs
@@ -229,8 +229,12 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
|
||||
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "x86_64";
|
||||
}
|
||||
|
||||
// This is only implemented on aarch64.
|
||||
("simd", "simd_boolean") => {
|
||||
return env::var("CARGO_CFG_TARGET_ARCH").unwrap() != "aarch64";
|
||||
}
|
||||
|
||||
// These tests have simd operators which aren't implemented yet.
|
||||
("simd", "simd_boolean") => return true,
|
||||
("simd", "simd_f32x4_pmin_pmax") => return true,
|
||||
("simd", "simd_f32x4_rounding") => return true,
|
||||
("simd", "simd_f64x2_pmin_pmax") => return true,
|
||||
|
||||
@@ -2193,6 +2193,24 @@ pub(crate) fn define(
|
||||
.operands_out(vec![s]),
|
||||
);
|
||||
|
||||
let a = &Operand::new("a", TxN);
|
||||
let x = &Operand::new("x", Int);
|
||||
|
||||
ig.push(
|
||||
Inst::new(
|
||||
"vhigh_bits",
|
||||
r#"
|
||||
Reduce a vector to a scalar integer.
|
||||
|
||||
Return a scalar integer, consisting of the concatenation of the most significant bit
|
||||
of each lane of ``a``.
|
||||
"#,
|
||||
&formats.unary,
|
||||
)
|
||||
.operands_in(vec![a])
|
||||
.operands_out(vec![x]),
|
||||
);
|
||||
|
||||
let a = &Operand::new("a", &Int.as_bool());
|
||||
let Cond = &Operand::new("Cond", &imm.intcc);
|
||||
let x = &Operand::new("x", Int);
|
||||
|
||||
@@ -1441,9 +1441,67 @@ impl MachInstEmit for Inst {
|
||||
};
|
||||
let (u, opcode) = match op {
|
||||
VecLanesOp::Uminv => (0b1, 0b11010),
|
||||
VecLanesOp::Addv => (0b0, 0b11011),
|
||||
};
|
||||
sink.put4(enc_vec_lanes(q, u, size, opcode, rd, rn));
|
||||
}
|
||||
&Inst::VecShiftImm {
|
||||
op,
|
||||
rd,
|
||||
rn,
|
||||
size,
|
||||
imm,
|
||||
} => {
|
||||
let (is_shr, template) = match op {
|
||||
VecShiftImmOp::Ushr => (true, 0b_011_011110_0000_000_000001_00000_00000_u32),
|
||||
VecShiftImmOp::Sshr => (true, 0b_010_011110_0000_000_000001_00000_00000_u32),
|
||||
VecShiftImmOp::Shl => (false, 0b_010_011110_0000_000_010101_00000_00000_u32),
|
||||
};
|
||||
let imm = imm as u32;
|
||||
// Deal with the somewhat strange encoding scheme for, and limits on,
|
||||
// the shift amount.
|
||||
let immh_immb = match (size, is_shr) {
|
||||
(VectorSize::Size64x2, true) if imm >= 1 && imm <= 64 => {
|
||||
0b_1000_000_u32 | (64 - imm)
|
||||
}
|
||||
(VectorSize::Size32x4, true) if imm >= 1 && imm <= 32 => {
|
||||
0b_0100_000_u32 | (32 - imm)
|
||||
}
|
||||
(VectorSize::Size16x8, true) if imm >= 1 && imm <= 16 => {
|
||||
0b_0010_000_u32 | (16 - imm)
|
||||
}
|
||||
(VectorSize::Size8x16, true) if imm >= 1 && imm <= 8 => {
|
||||
0b_0001_000_u32 | (8 - imm)
|
||||
}
|
||||
(VectorSize::Size64x2, false) if imm <= 63 => 0b_1000_000_u32 | imm,
|
||||
(VectorSize::Size32x4, false) if imm <= 31 => 0b_0100_000_u32 | imm,
|
||||
(VectorSize::Size16x8, false) if imm <= 15 => 0b_0010_000_u32 | imm,
|
||||
(VectorSize::Size8x16, false) if imm <= 7 => 0b_0001_000_u32 | imm,
|
||||
_ => panic!(
|
||||
"aarch64: Inst::VecShiftImm: emit: invalid op/size/imm {:?}, {:?}, {:?}",
|
||||
op, size, imm
|
||||
),
|
||||
};
|
||||
let rn_enc = machreg_to_vec(rn);
|
||||
let rd_enc = machreg_to_vec(rd.to_reg());
|
||||
sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc);
|
||||
}
|
||||
&Inst::VecExtract { rd, rn, rm, imm4 } => {
|
||||
if imm4 < 16 {
|
||||
let template = 0b_01_101110_000_00000_0_0000_0_00000_00000_u32;
|
||||
let rm_enc = machreg_to_vec(rm);
|
||||
let rn_enc = machreg_to_vec(rn);
|
||||
let rd_enc = machreg_to_vec(rd.to_reg());
|
||||
sink.put4(
|
||||
template | (rm_enc << 16) | ((imm4 as u32) << 11) | (rn_enc << 5) | rd_enc,
|
||||
);
|
||||
} else {
|
||||
panic!(
|
||||
"aarch64: Inst::VecExtract: emit: invalid extract index {}",
|
||||
imm4
|
||||
);
|
||||
}
|
||||
}
|
||||
&Inst::VecTbl {
|
||||
rd,
|
||||
rn,
|
||||
@@ -1827,6 +1885,7 @@ impl MachInstEmit for Inst {
|
||||
debug_assert!(!size.is_128bits());
|
||||
(0b001_01110_00_1 | enc_size << 1, 0b100000)
|
||||
}
|
||||
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
|
||||
};
|
||||
let top11 = if is_float {
|
||||
top11 | enc_float_size << 1
|
||||
|
||||
@@ -3175,6 +3175,54 @@ fn test_aarch64_binemit() {
|
||||
"umlal v9.2d, v20.2s, v17.2s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Zip1,
|
||||
rd: writable_vreg(16),
|
||||
rn: vreg(12),
|
||||
rm: vreg(1),
|
||||
size: VectorSize::Size8x16,
|
||||
},
|
||||
"9039014E",
|
||||
"zip1 v16.16b, v12.16b, v1.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Zip1,
|
||||
rd: writable_vreg(2),
|
||||
rn: vreg(13),
|
||||
rm: vreg(6),
|
||||
size: VectorSize::Size16x8,
|
||||
},
|
||||
"A239464E",
|
||||
"zip1 v2.8h, v13.8h, v6.8h",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Zip1,
|
||||
rd: writable_vreg(8),
|
||||
rn: vreg(12),
|
||||
rm: vreg(14),
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"88398E4E",
|
||||
"zip1 v8.4s, v12.4s, v14.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecRRR {
|
||||
alu_op: VecALUOp::Zip1,
|
||||
rd: writable_vreg(9),
|
||||
rn: vreg(20),
|
||||
rm: vreg(17),
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"893AD14E",
|
||||
"zip1 v9.2d, v20.2d, v17.2d",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecMisc {
|
||||
op: VecMisc2::Not,
|
||||
@@ -3461,6 +3509,168 @@ fn test_aarch64_binemit() {
|
||||
"uminv s18, v4.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLanes {
|
||||
op: VecLanesOp::Addv,
|
||||
rd: writable_vreg(2),
|
||||
rn: vreg(29),
|
||||
size: VectorSize::Size8x16,
|
||||
},
|
||||
"A2BB314E",
|
||||
"addv b2, v29.16b",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLanes {
|
||||
op: VecLanesOp::Addv,
|
||||
rd: writable_vreg(3),
|
||||
rn: vreg(21),
|
||||
size: VectorSize::Size16x8,
|
||||
},
|
||||
"A3BA714E",
|
||||
"addv h3, v21.8h",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecLanes {
|
||||
op: VecLanesOp::Addv,
|
||||
rd: writable_vreg(18),
|
||||
rn: vreg(5),
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"B2B8B14E",
|
||||
"addv s18, v5.4s",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Shl,
|
||||
rd: writable_vreg(27),
|
||||
rn: vreg(5),
|
||||
imm: 7,
|
||||
size: VectorSize::Size8x16,
|
||||
},
|
||||
"BB540F4F",
|
||||
"shl v27.16b, v5.16b, #7",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Shl,
|
||||
rd: writable_vreg(1),
|
||||
rn: vreg(30),
|
||||
imm: 0,
|
||||
size: VectorSize::Size8x16,
|
||||
},
|
||||
"C157084F",
|
||||
"shl v1.16b, v30.16b, #0",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Sshr,
|
||||
rd: writable_vreg(26),
|
||||
rn: vreg(6),
|
||||
imm: 16,
|
||||
size: VectorSize::Size16x8,
|
||||
},
|
||||
"DA04104F",
|
||||
"sshr v26.8h, v6.8h, #16",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Sshr,
|
||||
rd: writable_vreg(3),
|
||||
rn: vreg(19),
|
||||
imm: 1,
|
||||
size: VectorSize::Size16x8,
|
||||
},
|
||||
"63061F4F",
|
||||
"sshr v3.8h, v19.8h, #1",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Ushr,
|
||||
rd: writable_vreg(25),
|
||||
rn: vreg(6),
|
||||
imm: 32,
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"D904206F",
|
||||
"ushr v25.4s, v6.4s, #32",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Ushr,
|
||||
rd: writable_vreg(5),
|
||||
rn: vreg(21),
|
||||
imm: 1,
|
||||
size: VectorSize::Size32x4,
|
||||
},
|
||||
"A5063F6F",
|
||||
"ushr v5.4s, v21.4s, #1",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Shl,
|
||||
rd: writable_vreg(22),
|
||||
rn: vreg(13),
|
||||
imm: 63,
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"B6557F4F",
|
||||
"shl v22.2d, v13.2d, #63",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Shl,
|
||||
rd: writable_vreg(23),
|
||||
rn: vreg(9),
|
||||
imm: 0,
|
||||
size: VectorSize::Size64x2,
|
||||
},
|
||||
"3755404F",
|
||||
"shl v23.2d, v9.2d, #0",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecExtract {
|
||||
rd: writable_vreg(1),
|
||||
rn: vreg(30),
|
||||
rm: vreg(17),
|
||||
imm4: 0,
|
||||
},
|
||||
"C103116E",
|
||||
"ext v1.16b, v30.16b, v17.16b, #0",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecExtract {
|
||||
rd: writable_vreg(1),
|
||||
rn: vreg(30),
|
||||
rm: vreg(17),
|
||||
imm4: 8,
|
||||
},
|
||||
"C143116E",
|
||||
"ext v1.16b, v30.16b, v17.16b, #8",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecExtract {
|
||||
rd: writable_vreg(1),
|
||||
rn: vreg(30),
|
||||
rm: vreg(17),
|
||||
imm4: 15,
|
||||
},
|
||||
"C17B116E",
|
||||
"ext v1.16b, v30.16b, v17.16b, #15",
|
||||
));
|
||||
|
||||
insns.push((
|
||||
Inst::VecTbl {
|
||||
rd: writable_vreg(0),
|
||||
|
||||
@@ -287,6 +287,8 @@ pub enum VecALUOp {
|
||||
Addp,
|
||||
/// Unsigned multiply add long
|
||||
Umlal,
|
||||
/// Zip vectors (primary) [meaning, high halves]
|
||||
Zip1,
|
||||
}
|
||||
|
||||
/// A Vector miscellaneous operation with two registers.
|
||||
@@ -332,10 +334,23 @@ pub enum VecMiscNarrowOp {
|
||||
/// An operation across the lanes of vectors.
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum VecLanesOp {
|
||||
/// Integer addition across a vector
|
||||
Addv,
|
||||
/// Unsigned minimum across a vector
|
||||
Uminv,
|
||||
}
|
||||
|
||||
/// A shift-by-immediate operation on each lane of a vector.
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub enum VecShiftImmOp {
|
||||
// Unsigned shift left
|
||||
Shl,
|
||||
// Unsigned shift right
|
||||
Ushr,
|
||||
// Signed shift right
|
||||
Sshr,
|
||||
}
|
||||
|
||||
/// An operation on the bits of a register. This can be paired with several instruction formats
|
||||
/// below (see `Inst`) in any combination.
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
@@ -949,6 +964,28 @@ pub enum Inst {
|
||||
size: VectorSize,
|
||||
},
|
||||
|
||||
/// Vector shift by immediate: Shift Left (immediate), Unsigned Shift Right (immediate),
|
||||
/// Signed Shift Right (immediate). These are somewhat unusual in that, for right shifts,
|
||||
/// the allowed range of `imm` values is 1 to lane-size-in-bits, inclusive. A zero
|
||||
/// right-shift cannot be encoded. Left shifts are "normal", though, having valid `imm`
|
||||
/// values from 0 to lane-size-in-bits - 1 inclusive.
|
||||
VecShiftImm {
|
||||
op: VecShiftImmOp,
|
||||
rd: Writable<Reg>,
|
||||
rn: Reg,
|
||||
size: VectorSize,
|
||||
imm: u8,
|
||||
},
|
||||
|
||||
/// Vector extract - create a new vector, being the concatenation of the lowest `imm4` bytes
|
||||
/// of `rm` followed by the uppermost `16 - imm4` bytes of `rn`.
|
||||
VecExtract {
|
||||
rd: Writable<Reg>,
|
||||
rn: Reg,
|
||||
rm: Reg,
|
||||
imm4: u8,
|
||||
},
|
||||
|
||||
/// Table vector lookup - single register table. The table consists of 8-bit elements and is
|
||||
/// stored in `rn`, while `rm` contains 8-bit element indices. `is_extension` specifies whether
|
||||
/// to emit a TBX or a TBL instruction, i.e. whether to leave the elements in the destination
|
||||
@@ -1577,6 +1614,15 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
|
||||
collector.add_def(rd);
|
||||
collector.add_use(rn);
|
||||
}
|
||||
&Inst::VecShiftImm { rd, rn, .. } => {
|
||||
collector.add_def(rd);
|
||||
collector.add_use(rn);
|
||||
}
|
||||
&Inst::VecExtract { rd, rn, rm, .. } => {
|
||||
collector.add_def(rd);
|
||||
collector.add_use(rn);
|
||||
collector.add_use(rm);
|
||||
}
|
||||
&Inst::VecTbl {
|
||||
rd,
|
||||
rn,
|
||||
@@ -2157,6 +2203,24 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
|
||||
map_def(mapper, rd);
|
||||
map_use(mapper, rn);
|
||||
}
|
||||
&mut Inst::VecShiftImm {
|
||||
ref mut rd,
|
||||
ref mut rn,
|
||||
..
|
||||
} => {
|
||||
map_def(mapper, rd);
|
||||
map_use(mapper, rn);
|
||||
}
|
||||
&mut Inst::VecExtract {
|
||||
ref mut rd,
|
||||
ref mut rn,
|
||||
ref mut rm,
|
||||
..
|
||||
} => {
|
||||
map_def(mapper, rd);
|
||||
map_use(mapper, rn);
|
||||
map_use(mapper, rm);
|
||||
}
|
||||
&mut Inst::VecTbl {
|
||||
ref mut rd,
|
||||
ref mut rn,
|
||||
@@ -3330,6 +3394,7 @@ impl Inst {
|
||||
VecALUOp::Fmul => ("fmul", size),
|
||||
VecALUOp::Addp => ("addp", size),
|
||||
VecALUOp::Umlal => ("umlal", size),
|
||||
VecALUOp::Zip1 => ("zip1", size),
|
||||
};
|
||||
let rd_size = if alu_op == VecALUOp::Umlal {
|
||||
size.widen()
|
||||
@@ -3381,11 +3446,28 @@ impl Inst {
|
||||
&Inst::VecLanes { op, rd, rn, size } => {
|
||||
let op = match op {
|
||||
VecLanesOp::Uminv => "uminv",
|
||||
VecLanesOp::Addv => "addv",
|
||||
};
|
||||
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size.lane_size());
|
||||
let rn = show_vreg_vector(rn, mb_rru, size);
|
||||
format!("{} {}, {}", op, rd, rn)
|
||||
}
|
||||
&Inst::VecShiftImm { op, rd, rn, size, imm } => {
|
||||
let op = match op {
|
||||
VecShiftImmOp::Shl => "shl",
|
||||
VecShiftImmOp::Ushr => "ushr",
|
||||
VecShiftImmOp::Sshr => "sshr",
|
||||
};
|
||||
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
|
||||
let rn = show_vreg_vector(rn, mb_rru, size);
|
||||
format!("{} {}, {}, #{}", op, rd, rn, imm)
|
||||
}
|
||||
&Inst::VecExtract { rd, rn, rm, imm4 } => {
|
||||
let rd = show_vreg_vector(rd.to_reg(), mb_rru, VectorSize::Size8x16);
|
||||
let rn = show_vreg_vector(rn, mb_rru, VectorSize::Size8x16);
|
||||
let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
|
||||
format!("ext {}, {}, {}, #{}", rd, rn, rm, imm4)
|
||||
}
|
||||
&Inst::VecTbl {
|
||||
rd,
|
||||
rn,
|
||||
|
||||
@@ -2060,6 +2060,197 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
normalize_bool_result(ctx, insn, rd);
|
||||
}
|
||||
|
||||
Opcode::VhighBits => {
|
||||
let dst_r = get_output_reg(ctx, outputs[0]);
|
||||
let src_v = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let ty = ctx.input_ty(insn, 0);
|
||||
// All three sequences use one integer temporary and two vector temporaries. The
|
||||
// shift is done early so as to give the register allocator the possibility of using
|
||||
// the same reg for `tmp_v1` and `src_v` in the case that this is the last use of
|
||||
// `src_v`. See https://github.com/WebAssembly/simd/pull/201 for the background and
|
||||
// derivation of these sequences. Alternative sequences are discussed in
|
||||
// https://github.com/bytecodealliance/wasmtime/issues/2296, although they are not
|
||||
// used here.
|
||||
// Also .. FIXME: when https://github.com/bytecodealliance/wasmtime/pull/2310 is
|
||||
// merged, use `lower_splat_constant` instead to generate the constants.
|
||||
let tmp_r0 = ctx.alloc_tmp(RegClass::I64, I64);
|
||||
let tmp_v0 = ctx.alloc_tmp(RegClass::V128, I8X16);
|
||||
let tmp_v1 = ctx.alloc_tmp(RegClass::V128, I8X16);
|
||||
match ty {
|
||||
I8X16 => {
|
||||
// sshr tmp_v1.16b, src_v.16b, #7
|
||||
// mov tmp_r0, #0x0201
|
||||
// movk tmp_r0, #0x0804, lsl 16
|
||||
// movk tmp_r0, #0x2010, lsl 32
|
||||
// movk tmp_r0, #0x8040, lsl 48
|
||||
// dup tmp_v0.2d, tmp_r0
|
||||
// and tmp_v1.16b, tmp_v1.16b, tmp_v0.16b
|
||||
// ext tmp_v0.16b, tmp_v1.16b, tmp_v1.16b, #8
|
||||
// zip1 tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
|
||||
// addv tmp_v0h, tmp_v0.8h
|
||||
// mov dst_r, tmp_v0.h[0]
|
||||
ctx.emit(Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Sshr,
|
||||
rd: tmp_v1,
|
||||
rn: src_v,
|
||||
size: VectorSize::Size8x16,
|
||||
imm: 7,
|
||||
});
|
||||
lower_constant_u64(ctx, tmp_r0, 0x8040201008040201u64);
|
||||
ctx.emit(Inst::VecDup {
|
||||
rd: tmp_v0,
|
||||
rn: tmp_r0.to_reg(),
|
||||
size: VectorSize::Size64x2,
|
||||
});
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::And,
|
||||
rd: tmp_v1,
|
||||
rn: tmp_v1.to_reg(),
|
||||
rm: tmp_v0.to_reg(),
|
||||
size: VectorSize::Size8x16,
|
||||
});
|
||||
ctx.emit(Inst::VecExtract {
|
||||
rd: tmp_v0,
|
||||
rn: tmp_v1.to_reg(),
|
||||
rm: tmp_v1.to_reg(),
|
||||
imm4: 8,
|
||||
});
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::Zip1,
|
||||
rd: tmp_v0,
|
||||
rn: tmp_v1.to_reg(),
|
||||
rm: tmp_v0.to_reg(),
|
||||
size: VectorSize::Size8x16,
|
||||
});
|
||||
ctx.emit(Inst::VecLanes {
|
||||
op: VecLanesOp::Addv,
|
||||
rd: tmp_v0,
|
||||
rn: tmp_v0.to_reg(),
|
||||
size: VectorSize::Size16x8,
|
||||
});
|
||||
ctx.emit(Inst::MovFromVec {
|
||||
rd: dst_r,
|
||||
rn: tmp_v0.to_reg(),
|
||||
idx: 0,
|
||||
size: VectorSize::Size16x8,
|
||||
});
|
||||
}
|
||||
I16X8 => {
|
||||
// sshr tmp_v1.8h, src_v.8h, #15
|
||||
// mov tmp_r0, #0x1
|
||||
// movk tmp_r0, #0x2, lsl 16
|
||||
// movk tmp_r0, #0x4, lsl 32
|
||||
// movk tmp_r0, #0x8, lsl 48
|
||||
// dup tmp_v0.2d, tmp_r0
|
||||
// shl tmp_r0, tmp_r0, #4
|
||||
// mov tmp_v0.d[1], tmp_r0
|
||||
// and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
|
||||
// addv tmp_v0h, tmp_v0.8h
|
||||
// mov dst_r, tmp_v0.h[0]
|
||||
ctx.emit(Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Sshr,
|
||||
rd: tmp_v1,
|
||||
rn: src_v,
|
||||
size: VectorSize::Size16x8,
|
||||
imm: 15,
|
||||
});
|
||||
lower_constant_u64(ctx, tmp_r0, 0x0008000400020001u64);
|
||||
ctx.emit(Inst::VecDup {
|
||||
rd: tmp_v0,
|
||||
rn: tmp_r0.to_reg(),
|
||||
size: VectorSize::Size64x2,
|
||||
});
|
||||
ctx.emit(Inst::AluRRImmShift {
|
||||
alu_op: ALUOp::Lsl64,
|
||||
rd: tmp_r0,
|
||||
rn: tmp_r0.to_reg(),
|
||||
immshift: ImmShift { imm: 4 },
|
||||
});
|
||||
ctx.emit(Inst::MovToVec {
|
||||
rd: tmp_v0,
|
||||
rn: tmp_r0.to_reg(),
|
||||
idx: 1,
|
||||
size: VectorSize::Size64x2,
|
||||
});
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::And,
|
||||
rd: tmp_v0,
|
||||
rn: tmp_v1.to_reg(),
|
||||
rm: tmp_v0.to_reg(),
|
||||
size: VectorSize::Size8x16,
|
||||
});
|
||||
ctx.emit(Inst::VecLanes {
|
||||
op: VecLanesOp::Addv,
|
||||
rd: tmp_v0,
|
||||
rn: tmp_v0.to_reg(),
|
||||
size: VectorSize::Size16x8,
|
||||
});
|
||||
ctx.emit(Inst::MovFromVec {
|
||||
rd: dst_r,
|
||||
rn: tmp_v0.to_reg(),
|
||||
idx: 0,
|
||||
size: VectorSize::Size16x8,
|
||||
});
|
||||
}
|
||||
I32X4 => {
|
||||
// sshr tmp_v1.4s, src_v.4s, #31
|
||||
// mov tmp_r0, #0x1
|
||||
// movk tmp_r0, #0x2, lsl 32
|
||||
// dup tmp_v0.2d, tmp_r0
|
||||
// shl tmp_r0, tmp_r0, #2
|
||||
// mov tmp_v0.d[1], tmp_r0
|
||||
// and tmp_v0.16b, tmp_v1.16b, tmp_v0.16b
|
||||
// addv tmp_v0s, tmp_v0.4s
|
||||
// mov dst_r, tmp_v0.s[0]
|
||||
ctx.emit(Inst::VecShiftImm {
|
||||
op: VecShiftImmOp::Sshr,
|
||||
rd: tmp_v1,
|
||||
rn: src_v,
|
||||
size: VectorSize::Size32x4,
|
||||
imm: 31,
|
||||
});
|
||||
lower_constant_u64(ctx, tmp_r0, 0x0000000200000001u64);
|
||||
ctx.emit(Inst::VecDup {
|
||||
rd: tmp_v0,
|
||||
rn: tmp_r0.to_reg(),
|
||||
size: VectorSize::Size64x2,
|
||||
});
|
||||
ctx.emit(Inst::AluRRImmShift {
|
||||
alu_op: ALUOp::Lsl64,
|
||||
rd: tmp_r0,
|
||||
rn: tmp_r0.to_reg(),
|
||||
immshift: ImmShift { imm: 2 },
|
||||
});
|
||||
ctx.emit(Inst::MovToVec {
|
||||
rd: tmp_v0,
|
||||
rn: tmp_r0.to_reg(),
|
||||
idx: 1,
|
||||
size: VectorSize::Size64x2,
|
||||
});
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::And,
|
||||
rd: tmp_v0,
|
||||
rn: tmp_v1.to_reg(),
|
||||
rm: tmp_v0.to_reg(),
|
||||
size: VectorSize::Size8x16,
|
||||
});
|
||||
ctx.emit(Inst::VecLanes {
|
||||
op: VecLanesOp::Addv,
|
||||
rd: tmp_v0,
|
||||
rn: tmp_v0.to_reg(),
|
||||
size: VectorSize::Size32x4,
|
||||
});
|
||||
ctx.emit(Inst::MovFromVec {
|
||||
rd: dst_r,
|
||||
rn: tmp_v0.to_reg(),
|
||||
idx: 0,
|
||||
size: VectorSize::Size32x4,
|
||||
});
|
||||
}
|
||||
_ => panic!("arm64 isel: VhighBits unhandled, ty = {:?}", ty),
|
||||
}
|
||||
}
|
||||
|
||||
Opcode::Shuffle => {
|
||||
let mask = const_param_to_u128(ctx, insn).expect("Invalid immediate mask bytes");
|
||||
let rd = get_output_reg(ctx, outputs[0]);
|
||||
|
||||
Binary file not shown.
@@ -1600,6 +1600,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
||||
let bool_result = builder.ins().vall_true(a);
|
||||
state.push1(builder.ins().bint(I32, bool_result))
|
||||
}
|
||||
Operator::I8x16Bitmask | Operator::I16x8Bitmask | Operator::I32x4Bitmask => {
|
||||
let a = pop1_with_bitcast(state, type_of(op), builder);
|
||||
state.push1(builder.ins().vhigh_bits(I32, a));
|
||||
}
|
||||
Operator::I8x16Eq | Operator::I16x8Eq | Operator::I32x4Eq => {
|
||||
translate_vector_icmp(IntCC::Equal, type_of(op), builder, state)
|
||||
}
|
||||
@@ -1763,10 +1767,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
|
||||
| Operator::F64x2Trunc
|
||||
| Operator::F64x2PMin
|
||||
| Operator::F64x2PMax
|
||||
| Operator::F64x2Nearest
|
||||
| Operator::I8x16Bitmask
|
||||
| Operator::I16x8Bitmask
|
||||
| Operator::I32x4Bitmask => {
|
||||
| Operator::F64x2Nearest => {
|
||||
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user