s390x: Implement full SIMD support (#4427)
This adds full support for all Cranelift SIMD instructions to the s390x target. Everything is matched fully via ISLE. In addition to adding support for many new instructions, and the lower.isle code to match all SIMD IR patterns, this patch also adds ABI support for vector types. In particular, we now need to handle the fact that vector registers 8 .. 15 are partially callee-saved, i.e. the high parts of those registers (which correspond to the old floating-point registers) are callee-saved, but the low parts are not. This is the exact same situation that we already have on AArch64, and so this patch uses the same solution (the is_included_in_clobbers callback). The bulk of the changes are platform-specific, but there are a few exceptions: - Added ISLE extractors for the Immediate and Constant types, to enable matching the vconst and swizzle instructions. - Added a missing accessor for call_conv to ABISig. - Fixed endian conversion for vector types in data_value.rs to enable their use in runtests on the big-endian platforms. - Enabled (nearly) all SIMD runtests on s390x. [ Two test cases remain disabled due to vector shift count semantics, see below. ] - Enabled all Wasmtime SIMD tests on s390x. There are three minor issues, called out via FIXMEs below, which should be addressed in the future, but should not be blockers to getting this patch merged. I've opened the following issues to track them: - Vector shift count semantics https://github.com/bytecodealliance/wasmtime/issues/4424 - is_included_in_clobbers vs. link register https://github.com/bytecodealliance/wasmtime/issues/4425 - gen_constant callback https://github.com/bytecodealliance/wasmtime/issues/4426 All tests, including all newly enabled SIMD tests, pass on both z14 and z15 architectures.
This commit is contained in:
@@ -89,7 +89,7 @@ impl DataValue {
|
||||
DataValue::I128(i) => dst[..16].copy_from_slice(&i.to_ne_bytes()[..]),
|
||||
DataValue::F32(f) => dst[..4].copy_from_slice(&f.bits().to_ne_bytes()[..]),
|
||||
DataValue::F64(f) => dst[..8].copy_from_slice(&f.bits().to_ne_bytes()[..]),
|
||||
DataValue::V128(v) => dst[..16].copy_from_slice(&v[..]),
|
||||
DataValue::V128(v) => dst[..16].copy_from_slice(&u128::from_le_bytes(*v).to_ne_bytes()),
|
||||
_ => unimplemented!(),
|
||||
};
|
||||
}
|
||||
@@ -120,7 +120,7 @@ impl DataValue {
|
||||
DataValue::B(src[..size].iter().any(|&i| i != 0))
|
||||
}
|
||||
_ if ty.is_vector() && ty.bytes() == 16 => {
|
||||
DataValue::V128(src[..16].try_into().unwrap())
|
||||
DataValue::V128(u128::from_ne_bytes(src[..16].try_into().unwrap()).to_le_bytes())
|
||||
}
|
||||
_ => unimplemented!(),
|
||||
}
|
||||
|
||||
@@ -97,6 +97,10 @@ fn in_flt_reg(ty: Type) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn in_vec_reg(ty: Type) -> bool {
|
||||
ty.is_vector() && ty.bits() == 128
|
||||
}
|
||||
|
||||
fn get_intreg_for_arg(idx: usize) -> Option<Reg> {
|
||||
match idx {
|
||||
0 => Some(regs::gpr(2)),
|
||||
@@ -118,6 +122,20 @@ fn get_fltreg_for_arg(idx: usize) -> Option<Reg> {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_vecreg_for_arg(idx: usize) -> Option<Reg> {
|
||||
match idx {
|
||||
0 => Some(regs::vr(24)),
|
||||
1 => Some(regs::vr(25)),
|
||||
2 => Some(regs::vr(26)),
|
||||
3 => Some(regs::vr(27)),
|
||||
4 => Some(regs::vr(28)),
|
||||
5 => Some(regs::vr(29)),
|
||||
6 => Some(regs::vr(30)),
|
||||
7 => Some(regs::vr(31)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_intreg_for_ret(idx: usize) -> Option<Reg> {
|
||||
match idx {
|
||||
0 => Some(regs::gpr(2)),
|
||||
@@ -140,6 +158,21 @@ fn get_fltreg_for_ret(idx: usize) -> Option<Reg> {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_vecreg_for_ret(idx: usize) -> Option<Reg> {
|
||||
match idx {
|
||||
0 => Some(regs::vr(24)),
|
||||
// ABI extension to support multi-value returns:
|
||||
1 => Some(regs::vr(25)),
|
||||
2 => Some(regs::vr(26)),
|
||||
3 => Some(regs::vr(27)),
|
||||
4 => Some(regs::vr(28)),
|
||||
5 => Some(regs::vr(29)),
|
||||
6 => Some(regs::vr(30)),
|
||||
7 => Some(regs::vr(31)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the limit for the size of argument and return-value areas on the
|
||||
/// stack. We place a reasonable limit here to avoid integer overflow issues
|
||||
/// with 32-bit arithmetic: for now, 128 MB.
|
||||
@@ -182,6 +215,7 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
) -> CodegenResult<(Vec<ABIArg>, i64, Option<usize>)> {
|
||||
let mut next_gpr = 0;
|
||||
let mut next_fpr = 0;
|
||||
let mut next_vr = 0;
|
||||
let mut next_stack: u64 = 0;
|
||||
let mut ret = vec![];
|
||||
|
||||
@@ -206,8 +240,8 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
|
||||
let intreg = in_int_reg(param.value_type);
|
||||
let fltreg = in_flt_reg(param.value_type);
|
||||
debug_assert!(intreg || fltreg);
|
||||
debug_assert!(!(intreg && fltreg));
|
||||
let vecreg = in_vec_reg(param.value_type);
|
||||
debug_assert!(intreg as i32 + fltreg as i32 + vecreg as i32 == 1);
|
||||
|
||||
let (next_reg, candidate) = if intreg {
|
||||
let candidate = match args_or_rets {
|
||||
@@ -215,12 +249,18 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
ArgsOrRets::Rets => get_intreg_for_ret(next_gpr),
|
||||
};
|
||||
(&mut next_gpr, candidate)
|
||||
} else {
|
||||
} else if fltreg {
|
||||
let candidate = match args_or_rets {
|
||||
ArgsOrRets::Args => get_fltreg_for_arg(next_fpr),
|
||||
ArgsOrRets::Rets => get_fltreg_for_ret(next_fpr),
|
||||
};
|
||||
(&mut next_fpr, candidate)
|
||||
} else {
|
||||
let candidate = match args_or_rets {
|
||||
ArgsOrRets::Args => get_vecreg_for_arg(next_vr),
|
||||
ArgsOrRets::Rets => get_vecreg_for_ret(next_vr),
|
||||
};
|
||||
(&mut next_vr, candidate)
|
||||
};
|
||||
|
||||
// In the Wasmtime ABI only the first return value can be in a register.
|
||||
@@ -252,7 +292,8 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
|
||||
// Align the stack slot.
|
||||
debug_assert!(slot_size.is_power_of_two());
|
||||
next_stack = align_to(next_stack, slot_size);
|
||||
let slot_align = std::cmp::min(slot_size, 8);
|
||||
next_stack = align_to(next_stack, slot_align);
|
||||
|
||||
// If the type is actually of smaller size (and the argument
|
||||
// was not extended), it is passed right-aligned.
|
||||
@@ -477,6 +518,13 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
RegClass::Float => clobbered_fpr.push(reg),
|
||||
}
|
||||
}
|
||||
// We need to save the link register in non-leaf functions.
|
||||
// FIXME: This should be included in the clobber list to begin with,
|
||||
// but isn't because we have excluded call instructions via the
|
||||
// is_included_in_clobbers callback.
|
||||
if outgoing_args_size > 0 {
|
||||
clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14))));
|
||||
}
|
||||
|
||||
let mut first_clobbered_gpr = 16;
|
||||
for reg in clobbered_gpr {
|
||||
@@ -534,13 +582,15 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
|
||||
// Save FPRs.
|
||||
for (i, reg) in clobbered_fpr.iter().enumerate() {
|
||||
insts.push(Inst::FpuStore64 {
|
||||
insts.push(Inst::VecStoreLane {
|
||||
size: 64,
|
||||
rd: reg.to_reg().into(),
|
||||
mem: MemArg::reg_plus_off(
|
||||
stack_reg(),
|
||||
(i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64,
|
||||
MemFlags::trusted(),
|
||||
),
|
||||
lane_imm: 0,
|
||||
});
|
||||
if flags.unwind_info() {
|
||||
insts.push(Inst::Unwind {
|
||||
@@ -566,7 +616,14 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
let mut insts = SmallVec::new();
|
||||
|
||||
// Collect clobbered registers.
|
||||
let (clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers);
|
||||
let (mut clobbered_gpr, clobbered_fpr) = get_regs_saved_in_prologue(call_conv, clobbers);
|
||||
// We need to restore the link register in non-leaf functions.
|
||||
// FIXME: This should be included in the clobber list to begin with,
|
||||
// but isn't because we have excluded call instructions via the
|
||||
// is_included_in_clobbers callback.
|
||||
if outgoing_args_size > 0 {
|
||||
clobbered_gpr.push(Writable::from_reg(RealReg::from(gpr_preg(14))));
|
||||
}
|
||||
let mut first_clobbered_gpr = 16;
|
||||
for reg in clobbered_gpr {
|
||||
let enc = reg.to_reg().hw_enc();
|
||||
@@ -578,13 +635,15 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
|
||||
// Restore FPRs.
|
||||
for (i, reg) in clobbered_fpr.iter().enumerate() {
|
||||
insts.push(Inst::FpuLoad64 {
|
||||
insts.push(Inst::VecLoadLaneUndef {
|
||||
size: 64,
|
||||
rd: Writable::from_reg(reg.to_reg().into()),
|
||||
mem: MemArg::reg_plus_off(
|
||||
stack_reg(),
|
||||
(i * 8) as i64 + outgoing_args_size as i64 + fixed_frame_storage_size as i64,
|
||||
MemFlags::trusted(),
|
||||
),
|
||||
lane_imm: 0,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -639,7 +698,7 @@ impl ABIMachineSpec for S390xMachineDeps {
|
||||
// We allocate in terms of 8-byte slots.
|
||||
match rc {
|
||||
RegClass::Int => 1,
|
||||
RegClass::Float => 1,
|
||||
RegClass::Float => 2,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -739,6 +798,21 @@ const fn clobbers() -> PRegSet {
|
||||
.with(gpr_preg(3))
|
||||
.with(gpr_preg(4))
|
||||
.with(gpr_preg(5))
|
||||
// v0 - v7 inclusive and v16 - v31 inclusive are
|
||||
// caller-saves. The upper 64 bits of v8 - v15 inclusive are
|
||||
// also caller-saves. However, because we cannot currently
|
||||
// represent partial registers to regalloc2, we indicate here
|
||||
// that every vector register is caller-save. Because this
|
||||
// function is used at *callsites*, approximating in this
|
||||
// direction (save more than necessary) is conservative and
|
||||
// thus safe.
|
||||
//
|
||||
// Note that we exclude clobbers from a call instruction when
|
||||
// a call instruction's callee has the same ABI as the caller
|
||||
// (the current function body); this is safe (anything
|
||||
// clobbered by callee can be clobbered by caller as well) and
|
||||
// avoids unnecessary saves of v8-v15 in the prologue even
|
||||
// though we include them as defs here.
|
||||
.with(vr_preg(0))
|
||||
.with(vr_preg(1))
|
||||
.with(vr_preg(2))
|
||||
@@ -747,6 +821,14 @@ const fn clobbers() -> PRegSet {
|
||||
.with(vr_preg(5))
|
||||
.with(vr_preg(6))
|
||||
.with(vr_preg(7))
|
||||
.with(vr_preg(8))
|
||||
.with(vr_preg(9))
|
||||
.with(vr_preg(10))
|
||||
.with(vr_preg(11))
|
||||
.with(vr_preg(12))
|
||||
.with(vr_preg(13))
|
||||
.with(vr_preg(14))
|
||||
.with(vr_preg(15))
|
||||
.with(vr_preg(16))
|
||||
.with(vr_preg(17))
|
||||
.with(vr_preg(18))
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -43,15 +43,28 @@ impl LowerBackend for S390xBackend {
|
||||
| Opcode::Bconst
|
||||
| Opcode::F32const
|
||||
| Opcode::F64const
|
||||
| Opcode::Vconst
|
||||
| Opcode::Null
|
||||
| Opcode::Iadd
|
||||
| Opcode::IaddIfcout
|
||||
| Opcode::Isub
|
||||
| Opcode::UaddSat
|
||||
| Opcode::SaddSat
|
||||
| Opcode::UsubSat
|
||||
| Opcode::SsubSat
|
||||
| Opcode::IaddPairwise
|
||||
| Opcode::Imin
|
||||
| Opcode::Umin
|
||||
| Opcode::Imax
|
||||
| Opcode::Umax
|
||||
| Opcode::AvgRound
|
||||
| Opcode::Iabs
|
||||
| Opcode::Ineg
|
||||
| Opcode::Imul
|
||||
| Opcode::Umulhi
|
||||
| Opcode::Smulhi
|
||||
| Opcode::WideningPairwiseDotProductS
|
||||
| Opcode::SqmulRoundSat
|
||||
| Opcode::Udiv
|
||||
| Opcode::Urem
|
||||
| Opcode::Sdiv
|
||||
@@ -64,6 +77,13 @@ impl LowerBackend for S390xBackend {
|
||||
| Opcode::Ireduce
|
||||
| Opcode::Uextend
|
||||
| Opcode::Sextend
|
||||
| Opcode::Snarrow
|
||||
| Opcode::Unarrow
|
||||
| Opcode::Uunarrow
|
||||
| Opcode::SwidenLow
|
||||
| Opcode::SwidenHigh
|
||||
| Opcode::UwidenLow
|
||||
| Opcode::UwidenHigh
|
||||
| Opcode::Bnot
|
||||
| Opcode::Band
|
||||
| Opcode::Bor
|
||||
@@ -72,6 +92,7 @@ impl LowerBackend for S390xBackend {
|
||||
| Opcode::BorNot
|
||||
| Opcode::BxorNot
|
||||
| Opcode::Bitselect
|
||||
| Opcode::Vselect
|
||||
| Opcode::Breduce
|
||||
| Opcode::Bextend
|
||||
| Opcode::Bmask
|
||||
@@ -86,11 +107,15 @@ impl LowerBackend for S390xBackend {
|
||||
| Opcode::Fdiv
|
||||
| Opcode::Fmin
|
||||
| Opcode::Fmax
|
||||
| Opcode::FminPseudo
|
||||
| Opcode::FmaxPseudo
|
||||
| Opcode::Sqrt
|
||||
| Opcode::Fneg
|
||||
| Opcode::Fabs
|
||||
| Opcode::Fpromote
|
||||
| Opcode::Fdemote
|
||||
| Opcode::FvpromoteLow
|
||||
| Opcode::Fvdemote
|
||||
| Opcode::Ceil
|
||||
| Opcode::Floor
|
||||
| Opcode::Trunc
|
||||
@@ -99,11 +124,20 @@ impl LowerBackend for S390xBackend {
|
||||
| Opcode::Fcopysign
|
||||
| Opcode::FcvtFromUint
|
||||
| Opcode::FcvtFromSint
|
||||
| Opcode::FcvtLowFromSint
|
||||
| Opcode::FcvtToUint
|
||||
| Opcode::FcvtToSint
|
||||
| Opcode::FcvtToUintSat
|
||||
| Opcode::FcvtToSintSat
|
||||
| Opcode::Splat
|
||||
| Opcode::Swizzle
|
||||
| Opcode::Shuffle
|
||||
| Opcode::Insertlane
|
||||
| Opcode::Extractlane
|
||||
| Opcode::ScalarToVector
|
||||
| Opcode::VhighBits
|
||||
| Opcode::Bitcast
|
||||
| Opcode::RawBitcast
|
||||
| Opcode::Load
|
||||
| Opcode::Uload8
|
||||
| Opcode::Sload8
|
||||
@@ -111,6 +145,12 @@ impl LowerBackend for S390xBackend {
|
||||
| Opcode::Sload16
|
||||
| Opcode::Uload32
|
||||
| Opcode::Sload32
|
||||
| Opcode::Uload8x8
|
||||
| Opcode::Sload8x8
|
||||
| Opcode::Uload16x4
|
||||
| Opcode::Sload16x4
|
||||
| Opcode::Uload32x2
|
||||
| Opcode::Sload32x2
|
||||
| Opcode::Store
|
||||
| Opcode::Istore8
|
||||
| Opcode::Istore16
|
||||
@@ -122,6 +162,8 @@ impl LowerBackend for S390xBackend {
|
||||
| Opcode::Fence
|
||||
| Opcode::Icmp
|
||||
| Opcode::Fcmp
|
||||
| Opcode::VanyTrue
|
||||
| Opcode::VallTrue
|
||||
| Opcode::IsNull
|
||||
| Opcode::IsInvalid
|
||||
| Opcode::Select
|
||||
@@ -147,57 +189,15 @@ impl LowerBackend for S390xBackend {
|
||||
)
|
||||
}
|
||||
|
||||
Opcode::UaddSat
|
||||
| Opcode::SaddSat
|
||||
| Opcode::UsubSat
|
||||
| Opcode::SsubSat
|
||||
| Opcode::Bitrev
|
||||
| Opcode::FcvtLowFromSint
|
||||
Opcode::Bitrev
|
||||
| Opcode::ConstAddr
|
||||
| Opcode::TlsValue
|
||||
| Opcode::GetPinnedReg
|
||||
| Opcode::SetPinnedReg
|
||||
| Opcode::Isplit
|
||||
| Opcode::Iconcat
|
||||
| Opcode::RawBitcast
|
||||
| Opcode::Splat
|
||||
| Opcode::Swizzle
|
||||
| Opcode::Insertlane
|
||||
| Opcode::Extractlane
|
||||
| Opcode::Imin
|
||||
| Opcode::Umin
|
||||
| Opcode::Imax
|
||||
| Opcode::Umax
|
||||
| Opcode::AvgRound
|
||||
| Opcode::FminPseudo
|
||||
| Opcode::FmaxPseudo
|
||||
| Opcode::Uload8x8
|
||||
| Opcode::Sload8x8
|
||||
| Opcode::Uload16x4
|
||||
| Opcode::Sload16x4
|
||||
| Opcode::Uload32x2
|
||||
| Opcode::Sload32x2
|
||||
| Opcode::Vconst
|
||||
| Opcode::Shuffle
|
||||
| Opcode::Vsplit
|
||||
| Opcode::Vconcat
|
||||
| Opcode::Vselect
|
||||
| Opcode::VanyTrue
|
||||
| Opcode::VallTrue
|
||||
| Opcode::VhighBits
|
||||
| Opcode::ScalarToVector
|
||||
| Opcode::Snarrow
|
||||
| Opcode::Unarrow
|
||||
| Opcode::Uunarrow
|
||||
| Opcode::SwidenLow
|
||||
| Opcode::SwidenHigh
|
||||
| Opcode::UwidenLow
|
||||
| Opcode::UwidenHigh
|
||||
| Opcode::WideningPairwiseDotProductS
|
||||
| Opcode::SqmulRoundSat
|
||||
| Opcode::FvpromoteLow
|
||||
| Opcode::Fvdemote
|
||||
| Opcode::IaddPairwise
|
||||
| Opcode::DynamicStackLoad
|
||||
| Opcode::DynamicStackStore
|
||||
| Opcode::DynamicStackAddr
|
||||
|
||||
@@ -6,7 +6,7 @@ pub mod generated_code;
|
||||
// Types that the generated ISLE code uses via `use super::*`.
|
||||
use crate::isa::s390x::abi::S390xMachineDeps;
|
||||
use crate::isa::s390x::inst::{
|
||||
stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg,
|
||||
stack_reg, writable_gpr, zero_reg, CallIndInfo, CallInfo, Cond, Inst as MInst, MemArg, UImm12,
|
||||
UImm16Shifted, UImm32Shifted,
|
||||
};
|
||||
use crate::isa::s390x::settings::Flags as IsaFlags;
|
||||
@@ -91,6 +91,8 @@ where
|
||||
defs,
|
||||
clobbers,
|
||||
opcode: *opcode,
|
||||
caller_callconv: self.lower_ctx.abi().call_conv(),
|
||||
callee_callconv: abi.call_conv(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -102,6 +104,8 @@ where
|
||||
defs,
|
||||
clobbers,
|
||||
opcode: *opcode,
|
||||
caller_callconv: self.lower_ctx.abi().call_conv(),
|
||||
callee_callconv: abi.call_conv(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -195,6 +199,46 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u64_pair_split(&mut self, n: u128) -> (u64, u64) {
|
||||
((n >> 64) as u64, n as u64)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u64_pair_concat(&mut self, hi: u64, lo: u64) -> u128 {
|
||||
(hi as u128) << 64 | (lo as u128)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u32_pair_split(&mut self, n: u64) -> (u32, u32) {
|
||||
((n >> 32) as u32, n as u32)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u32_pair_concat(&mut self, hi: u32, lo: u32) -> u64 {
|
||||
(hi as u64) << 32 | (lo as u64)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u16_pair_split(&mut self, n: u32) -> (u16, u16) {
|
||||
((n >> 16) as u16, n as u16)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u16_pair_concat(&mut self, hi: u16, lo: u16) -> u32 {
|
||||
(hi as u32) << 16 | (lo as u32)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u8_pair_split(&mut self, n: u16) -> (u8, u8) {
|
||||
((n >> 8) as u8, n as u8)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u8_pair_concat(&mut self, hi: u8, lo: u8) -> u16 {
|
||||
(hi as u16) << 8 | (lo as u16)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u8_as_u16(&mut self, n: u8) -> u16 {
|
||||
n as u16
|
||||
@@ -248,6 +292,15 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn i16_from_u32(&mut self, n: u32) -> Option<i16> {
|
||||
if let Ok(imm) = i16::try_from(n as i32) {
|
||||
Some(imm)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn uimm32shifted_from_u64(&mut self, n: u64) -> Option<UImm32Shifted> {
|
||||
UImm32Shifted::maybe_from_u64(n)
|
||||
@@ -258,11 +311,49 @@ where
|
||||
UImm16Shifted::maybe_from_u64(n)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn be_lane_idx(&mut self, ty: Type, idx: u8) -> u8 {
|
||||
ty.lane_count() as u8 - 1 - idx
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn lane_byte_mask(&mut self, ty: Type, idx: u8) -> u16 {
|
||||
let lane_bytes = (ty.lane_bits() / 8) as u8;
|
||||
let lane_mask = (1u16 << lane_bytes) - 1;
|
||||
lane_mask << (16 - ((idx + 1) * lane_bytes))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn shuffle_mask_from_u128(&mut self, idx: u128) -> (u128, u16) {
|
||||
let bytes = idx.to_be_bytes();
|
||||
let and_mask = bytes.iter().fold(0, |acc, &x| (acc << 1) | (x < 32) as u16);
|
||||
let bytes = bytes.map(|x| {
|
||||
if x < 16 {
|
||||
15 - x
|
||||
} else if x < 32 {
|
||||
47 - x
|
||||
} else {
|
||||
128
|
||||
}
|
||||
});
|
||||
let permute_mask = u128::from_be_bytes(bytes);
|
||||
(permute_mask, and_mask)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u64_from_value(&mut self, val: Value) -> Option<u64> {
|
||||
let inst = self.lower_ctx.dfg().value_def(val).inst()?;
|
||||
let constant = self.lower_ctx.get_constant(inst)?;
|
||||
Some(constant)
|
||||
let ty = self.lower_ctx.output_ty(inst, 0);
|
||||
Some(zero_extend_to_u64(constant, self.ty_bits(ty).unwrap()))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u64_from_inverted_value(&mut self, val: Value) -> Option<u64> {
|
||||
let inst = self.lower_ctx.dfg().value_def(val).inst()?;
|
||||
let constant = self.lower_ctx.get_constant(inst)?;
|
||||
let ty = self.lower_ctx.output_ty(inst, 0);
|
||||
Some(zero_extend_to_u64(!constant, self.ty_bits(ty).unwrap()))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -349,22 +440,22 @@ where
|
||||
|
||||
#[inline]
|
||||
fn uimm16shifted_from_inverted_value(&mut self, val: Value) -> Option<UImm16Shifted> {
|
||||
let constant = self.u64_from_value(val)?;
|
||||
let imm = UImm16Shifted::maybe_from_u64(!constant)?;
|
||||
let constant = self.u64_from_inverted_value(val)?;
|
||||
let imm = UImm16Shifted::maybe_from_u64(constant)?;
|
||||
Some(imm.negate_bits())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn uimm32shifted_from_inverted_value(&mut self, val: Value) -> Option<UImm32Shifted> {
|
||||
let constant = self.u64_from_value(val)?;
|
||||
let imm = UImm32Shifted::maybe_from_u64(!constant)?;
|
||||
let constant = self.u64_from_inverted_value(val)?;
|
||||
let imm = UImm32Shifted::maybe_from_u64(constant)?;
|
||||
Some(imm.negate_bits())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn mask_amt_imm(&mut self, ty: Type, amt: i64) -> u8 {
|
||||
let mask = self.ty_bits(ty).unwrap() - 1;
|
||||
(amt as u8) & mask
|
||||
let mask = ty.lane_bits() - 1;
|
||||
(amt as u8) & (mask as u8)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -498,13 +589,18 @@ where
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn memarg_reg_plus_reg(&mut self, x: Reg, y: Reg, flags: MemFlags) -> MemArg {
|
||||
MemArg::reg_plus_reg(x, y, flags)
|
||||
fn memarg_reg_plus_reg(&mut self, x: Reg, y: Reg, bias: u8, flags: MemFlags) -> MemArg {
|
||||
MemArg::BXD12 {
|
||||
base: x,
|
||||
index: y,
|
||||
disp: UImm12::maybe_from_u64(bias as u64).unwrap(),
|
||||
flags,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn memarg_reg_plus_off(&mut self, reg: Reg, off: i64, flags: MemFlags) -> MemArg {
|
||||
MemArg::reg_plus_off(reg, off, flags)
|
||||
fn memarg_reg_plus_off(&mut self, reg: Reg, off: i64, bias: u8, flags: MemFlags) -> MemArg {
|
||||
MemArg::reg_plus_off(reg, off + (bias as i64), flags)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -586,6 +682,17 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero-extend the low `from_bits` bits of `value` to a full u64.
|
||||
#[inline]
|
||||
fn zero_extend_to_u64(value: u64, from_bits: u8) -> u64 {
|
||||
assert!(from_bits <= 64);
|
||||
if from_bits >= 64 {
|
||||
value
|
||||
} else {
|
||||
value & ((1u64 << from_bits) - 1)
|
||||
}
|
||||
}
|
||||
|
||||
/// Sign-extend the low `from_bits` bits of `value` to a full u64.
|
||||
#[inline]
|
||||
fn sign_extend_to_u64(value: u64, from_bits: u8) -> u64 {
|
||||
|
||||
@@ -696,6 +696,11 @@ impl ABISig {
|
||||
let ret_arg = self.stack_ret_arg?;
|
||||
Some(self.args[ret_arg].clone())
|
||||
}
|
||||
|
||||
/// Get calling convention used.
|
||||
pub fn call_conv(&self) -> isa::CallConv {
|
||||
self.call_conv
|
||||
}
|
||||
}
|
||||
|
||||
/// ABI object for a function body.
|
||||
|
||||
@@ -7,7 +7,8 @@ use std::cell::Cell;
|
||||
|
||||
pub use super::MachLabel;
|
||||
pub use crate::ir::{
|
||||
ArgumentExtension, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, SigRef, StackSlot,
|
||||
ArgumentExtension, Constant, DynamicStackSlot, ExternalName, FuncRef, GlobalValue, Immediate,
|
||||
SigRef, StackSlot,
|
||||
};
|
||||
pub use crate::isa::unwind::UnwindInst;
|
||||
pub use crate::machinst::{ABIArg, ABIArgSlot, ABISig, RealReg, Reg, RelocDistance, Writable};
|
||||
@@ -547,6 +548,18 @@ macro_rules! isle_prelude_methods {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u128_from_immediate(&mut self, imm: Immediate) -> Option<u128> {
|
||||
let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
|
||||
Some(u128::from_le_bytes(bytes.try_into().ok()?))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn u128_from_constant(&mut self, constant: Constant) -> Option<u128> {
|
||||
let bytes = self.lower_ctx.get_constant_data(constant).as_slice();
|
||||
Some(u128::from_le_bytes(bytes.try_into().ok()?))
|
||||
}
|
||||
|
||||
fn nonzero_u64_from_imm64(&mut self, val: Imm64) -> Option<u64> {
|
||||
match val.bits() {
|
||||
0 => None,
|
||||
|
||||
@@ -12,8 +12,8 @@ use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
|
||||
use crate::ir::{
|
||||
types::{FFLAGS, IFLAGS},
|
||||
ArgumentPurpose, Block, Constant, ConstantData, DataFlowGraph, ExternalName, Function,
|
||||
GlobalValue, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc,
|
||||
Type, Value, ValueDef, ValueLabelAssignments, ValueLabelStart,
|
||||
GlobalValue, GlobalValueData, Immediate, Inst, InstructionData, MemFlags, Opcode, Signature,
|
||||
SourceLoc, Type, Value, ValueDef, ValueLabelAssignments, ValueLabelStart,
|
||||
};
|
||||
use crate::machinst::{
|
||||
non_writable_value_regs, writable_value_regs, ABICallee, BlockIndex, BlockLoweringOrder,
|
||||
@@ -167,6 +167,8 @@ pub trait LowerCtx {
|
||||
/// for the input produced by the sunk instruction), otherwise the
|
||||
/// side-effect will occur twice.
|
||||
fn sink_inst(&mut self, ir_inst: Inst);
|
||||
/// Retrieve immediate data given a handle.
|
||||
fn get_immediate_data(&self, imm: Immediate) -> &ConstantData;
|
||||
/// Retrieve constant data given a handle.
|
||||
fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData;
|
||||
/// Indicate that a constant should be emitted.
|
||||
@@ -1448,6 +1450,10 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
|
||||
self.inst_sunk.insert(ir_inst);
|
||||
}
|
||||
|
||||
fn get_immediate_data(&self, imm: Immediate) -> &ConstantData {
|
||||
self.f.dfg.immediates.get(imm).unwrap()
|
||||
}
|
||||
|
||||
fn get_constant_data(&self, constant_handle: Constant) -> &ConstantData {
|
||||
self.f.dfg.constants.get(constant_handle)
|
||||
}
|
||||
|
||||
@@ -661,6 +661,17 @@
|
||||
(decl reloc_distance_near () RelocDistance)
|
||||
(extern extractor reloc_distance_near reloc_distance_near)
|
||||
|
||||
;; Accessor for `Immediate` as u128.
|
||||
|
||||
(decl u128_from_immediate (u128) Immediate)
|
||||
(extern extractor u128_from_immediate u128_from_immediate)
|
||||
|
||||
;; Accessor for `Constant` as u128.
|
||||
|
||||
(decl u128_from_constant (u128) Constant)
|
||||
(extern extractor u128_from_constant u128_from_constant)
|
||||
|
||||
|
||||
;;;; Helpers for tail recursion loops ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; A range of integers to loop through.
|
||||
|
||||
Reference in New Issue
Block a user