x64: Lower shuffle and swizzle in ISLE (#4772)
Lower `shuffle` and `swizzle` in ISLE.
This PR surfaced a bug in the lowering of `shuffle` when avx512vl and avx512vbmi are enabled: we use `vpermi2b` as the implementation but panic if the immediate shuffle mask contains any out-of-bounds values. Without the avx512 extensions, out-of-bounds indices instead produce `0` in the corresponding lanes of the result.
I've resolved this by detecting, in the avx512-enabled lowering, when the shuffle immediate has out-of-bounds indices, and generating an additional mask to zero out the lanes those indices select. This brings the avx512 case into line with the semantics of the `shuffle` op: 94bcbe8446/cranelift/codegen/meta/src/shared/instructions.rs (L1495-L1498)
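
For illustration, here is a minimal Rust sketch of the mask construction described above. It is not the actual ISLE lowering, and `vpermi2b_zeroing_mask` is a made-up name: the idea is to build this mask only when the immediate contains out-of-bounds indices, and AND it with the `vpermi2b` result so those lanes become zero.

/// Sketch: build a byte mask that keeps lanes whose shuffle index is in
/// bounds (< 32, i.e. a lane of either input) and zeroes the rest, matching
/// the CLIF `shuffle` semantics when the avx512 lowering is used.
fn vpermi2b_zeroing_mask(imm: &[u8; 16]) -> Option<[u8; 16]> {
    // If every index is in bounds, no extra masking is needed.
    if imm.iter().all(|&b| b < 32) {
        return None;
    }
    let mut mask = [0u8; 16];
    for (lane, &idx) in imm.iter().enumerate() {
        // 0xff keeps the permuted byte, 0x00 forces the lane to zero.
        mask[lane] = if idx < 32 { 0xff } else { 0x00 };
    }
    Some(mask)
}

fn main() {
    let mut imm = [0u8; 16];
    imm[0] = 1;  // lane 1 of the first input
    imm[1] = 17; // lane 1 of the second input
    imm[2] = 63; // out of bounds: must produce zero
    let mask = vpermi2b_zeroing_mask(&imm).expect("index 63 is out of bounds");
    assert_eq!(mask[0], 0xff);
    assert_eq!(mask[1], 0xff);
    assert_eq!(mask[2], 0x00);
}

The diff below removes the old hand-written `Shuffle` and `Swizzle` lowerings and routes both opcodes to `implemented_in_isle`.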
@@ -3,7 +3,6 @@
 // ISLE integration glue.
 pub(super) mod isle;
 
-use crate::data_value::DataValue;
 use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
@@ -585,139 +584,14 @@ fn lower_insn_to_regs(
         | Opcode::SetPinnedReg
         | Opcode::Vconst
         | Opcode::RawBitcast
-        | Opcode::Insertlane => {
+        | Opcode::Insertlane
+        | Opcode::Shuffle
+        | Opcode::Swizzle => {
             implemented_in_isle(ctx);
         }
 
         Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
 
-        Opcode::Shuffle => {
-            let ty = ty.unwrap();
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let lhs_ty = ctx.input_ty(insn, 0);
-            let lhs = put_input_in_reg(ctx, inputs[0]);
-            let rhs = put_input_in_reg(ctx, inputs[1]);
-            let mask = match ctx.get_immediate(insn) {
-                Some(DataValue::V128(bytes)) => bytes.to_vec(),
-                _ => unreachable!("shuffle should always have a 16-byte immediate"),
-            };
-
-            // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
-            // 1 in the most significant position zeroes the lane.
-            let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
-
-            ctx.emit(Inst::gen_move(dst, rhs, ty));
-            if rhs == lhs {
-                // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
-                // register. We statically build `constructed_mask` to zero out any unknown lane
-                // indices (may not be completely necessary: verification could fail incorrect mask
-                // values) and fix the indexes to all point to the `dst` vector.
-                let constructed_mask = mask
-                    .iter()
-                    // If the mask is greater than 15 it still may be referring to a lane in b.
-                    .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
-                    .map(zero_unknown_lane_index)
-                    .collect();
-                let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
-                let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
-                // After loading the constructed mask in a temporary register, we use this to
-                // shuffle the `dst` register (remember that, in this case, it is the same as
-                // `src` so we disregard this register).
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
-            } else {
-                if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
-                    assert!(
-                        mask.iter().all(|b| *b < 32),
-                        "shuffle mask values must be between 0 and 31"
-                    );
-
-                    // Load the mask into the destination register.
-                    let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
-                    ctx.emit(Inst::xmm_load_const(constant, dst, ty));
-
-                    // VPERMI2B has the exact semantics of Wasm's shuffle:
-                    // permute the bytes in `src1` and `src2` using byte indexes
-                    // in `dst` and store the byte results in `dst`.
-                    ctx.emit(Inst::xmm_rm_r_evex(
-                        Avx512Opcode::Vpermi2b,
-                        RegMem::reg(rhs),
-                        lhs,
-                        dst,
-                    ));
-                } else {
-                    // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
-                    // them together. This is necessary due to PSHUFB semantics. As in the case above,
-                    // we build the `constructed_mask` for each case statically.
-
-                    // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
-                    let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
-                    ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
-                    let constructed_mask =
-                        mask.iter().cloned().map(zero_unknown_lane_index).collect();
-                    let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
-                    let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
-
-                    // PSHUFB the second argument, placing zeroes for unused lanes.
-                    let constructed_mask = mask
-                        .iter()
-                        .map(|b| b.wrapping_sub(16))
-                        .map(zero_unknown_lane_index)
-                        .collect();
-                    let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
-                    let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
-
-                    // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
-                    // is not important).
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
-                }
-            }
-        }
-
-        Opcode::Swizzle => {
-            // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
-            // requiring mask indexes greater than 15 to have the same semantics as a 0 index. For
-            // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF
-            // semantics match the Wasm SIMD semantics for this instruction.
-            // The instruction format maps to variables like: %dst = swizzle %src, %mask
-            let ty = ty.unwrap();
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src = put_input_in_reg(ctx, inputs[0]);
-            let swizzle_mask = put_input_in_reg(ctx, inputs[1]);
-
-            // Inform the register allocator that `src` and `dst` should be in the same register.
-            ctx.emit(Inst::gen_move(dst, src, ty));
-
-            // Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
-            let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-            static ZERO_MASK_VALUE: [u8; 16] = [
-                0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                0x70, 0x70,
-            ];
-            let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
-            ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
-
-            // Use the `zero_mask` on a writable `swizzle_mask`.
-            let swizzle_mask_tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-            ctx.emit(Inst::gen_move(swizzle_mask_tmp, swizzle_mask, ty));
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Paddusb,
-                RegMem::from(zero_mask),
-                swizzle_mask_tmp,
-            ));
-
-            // Shuffle `dst` using the fixed-up `swizzle_mask`.
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Pshufb,
-                RegMem::from(swizzle_mask_tmp),
-                dst,
-            ));
-        }
-
         Opcode::Extractlane => {
             // The instruction format maps to variables like: %dst = extractlane %src, %lane
             let ty = ty.unwrap();
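
A note on the removed swizzle lowering above: it gets the zero-for-out-of-range behavior from a single trick. PADDUSB saturating-adds 0x70 to every mask byte, so any index of 16 or more ends up with its top bit set, and PSHUFB writes zero for such lanes while still using the low four bits of in-range indices. A scalar sketch of that fix-up (illustrative only; `fix_up_swizzle_index` is a made-up name, not a function in the codebase):

/// Scalar model of the PADDUSB + PSHUFB fix-up in the removed swizzle lowering.
/// Indices 0..=15 keep their low four bits usable; anything larger saturates
/// into 0x80..=0xff, and PSHUFB zeroes lanes whose mask byte has the top bit set.
fn fix_up_swizzle_index(idx: u8) -> u8 {
    idx.saturating_add(0x70)
}

fn main() {
    assert_eq!(fix_up_swizzle_index(15), 0x7f); // in range: top bit clear, low bits still select lane 15
    assert_eq!(fix_up_swizzle_index(16), 0x80); // out of range: top bit set, PSHUFB writes a zero byte
    assert_eq!(fix_up_swizzle_index(200), 0xff); // saturates, lane is still zeroed
}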