x64: Lower tlsvalue, sqmul_round_sat, and uunarrow in ISLE (#4793)

Lower tlsvalue, sqmul_round_sat, and uunarrow in ISLE.
2022-08-26 16:33:48 -07:00
parent 8e8dfdf5f9
commit 25d960f9c4
11 changed files with 287 additions and 205 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -486,16 +486,19 @@
       (XmmUninitializedValue (dst WritableXmm))

       ;; A call to the `ElfTlsGetAddr` libcall. Returns address of TLS symbol
-       ;; in `rax`.
-       (ElfTlsGetAddr (symbol ExternalName))
+       ;; in `dst`, which is constrained to `rax`.
+       (ElfTlsGetAddr (symbol ExternalName)
+                      (dst WritableGpr))

       ;; A Mach-O TLS symbol access. Returns address of the TLS symbol in
-       ;; `rax`.
-       (MachOTlsGetAddr (symbol ExternalName))
+       ;; `dst`, which is constrained to `rax`.
+       (MachOTlsGetAddr (symbol ExternalName)
+                        (dst WritableGpr))

       ;; A Coff TLS symbol access. Returns address of the TLS symbol in
-       ;; `rax`.
-       (CoffTlsGetAddr (symbol ExternalName))
+       ;; `dst`, which is constrained to `rax`.
+       (CoffTlsGetAddr (symbol ExternalName)
+                       (dst WritableGpr))

       ;; An unwind pseudoinstruction describing the state of the machine at
       ;; this program point.
@@ -2275,6 +2278,11 @@
 (rule (x64_pmulhw src1 src2)
      (xmm_rm_r $I16X8 (SseOpcode.Pmulhw) src1 src2))

+;; Helper for creating `pmulhrsw` instructions.
+(decl x64_pmulhrsw (Xmm XmmMem) Xmm)
+(rule (x64_pmulhrsw src1 src2)
+      (xmm_rm_r $I16X8 (SseOpcode.Pmulhrsw) src1 src2))
+
 ;; Helper for creating `pmulhuw` instructions.
 (decl x64_pmulhuw (Xmm XmmMem) Xmm)
 (rule (x64_pmulhuw src1 src2)
@@ -2683,6 +2691,15 @@
                                        dst))))
        dst))

+;; Helper for creating `shufps` instructions.
+(decl x64_shufps (Xmm XmmMem u8) Xmm)
+(rule (x64_shufps src1 src2 byte)
+      (xmm_rm_r_imm (SseOpcode.Shufps)
+                    src1
+                    src2
+                    byte
+                    (OperandSize.Size32)))
+
 ;; Helper for creating `MInst.XmmUnaryRmR` instructions.
 (decl xmm_unary_rm_r (SseOpcode XmmMem) Xmm)
 (rule (xmm_unary_rm_r op src)
@@ -3733,6 +3750,42 @@
 (decl swizzle_zero_mask () VCodeConstant)
 (extern constructor swizzle_zero_mask swizzle_zero_mask)

+;;;; TLS Values ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Helper for emitting ElfTlsGetAddr.
+(decl elf_tls_get_addr (ExternalName) Gpr)
+(rule (elf_tls_get_addr name)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.ElfTlsGetAddr name dst))))
+        dst))
+
+;; Helper for emitting MachOTlsGetAddr.
+(decl macho_tls_get_addr (ExternalName) Gpr)
+(rule (macho_tls_get_addr name)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.MachOTlsGetAddr name dst))))
+        dst))
+
+;; Helper for emitting CoffTlsGetAddr.
+(decl coff_tls_get_addr (ExternalName) Gpr)
+(rule (coff_tls_get_addr name)
+      (let ((dst WritableGpr (temp_writable_gpr))
+            (_ Unit (emit (MInst.CoffTlsGetAddr name dst))))
+        dst))
+
+;;;; sqmul_round_sat ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(decl sqmul_round_sat_mask () VCodeConstant)
+(extern constructor sqmul_round_sat_mask sqmul_round_sat_mask)
+
+;;;; uunarrow ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(decl uunarrow_umax_mask () VCodeConstant)
+(extern constructor uunarrow_umax_mask uunarrow_umax_mask)
+
+(decl uunarrow_uint_mask () VCodeConstant)
+(extern constructor uunarrow_uint_mask uunarrow_uint_mask)
+
 ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (convert Gpr InstOutput output_gpr)
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2915,7 +2915,10 @@ pub(crate) fn emit(
            }
        }

-        Inst::ElfTlsGetAddr { ref symbol } => {
+        Inst::ElfTlsGetAddr { ref symbol, dst } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            debug_assert_eq!(dst, regs::rax());
+
            // N.B.: Must be exactly this byte sequence; the linker requires it,
            // because it must know how to rewrite the bytes.

@@ -2941,7 +2944,10 @@ pub(crate) fn emit(
            sink.put4(0); // offset
        }

-        Inst::MachOTlsGetAddr { ref symbol } => {
+        Inst::MachOTlsGetAddr { ref symbol, dst } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            debug_assert_eq!(dst, regs::rax());
+
            // movq gv@tlv(%rip), %rdi
            sink.put1(0x48); // REX.w
            sink.put1(0x8b); // MOV
@@ -2954,7 +2960,10 @@ pub(crate) fn emit(
            sink.put1(0x17);
        }

-        Inst::CoffTlsGetAddr { ref symbol } => {
+        Inst::CoffTlsGetAddr { ref symbol, dst } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            debug_assert_eq!(dst, regs::rax());
+
            // See: https://gcc.godbolt.org/z/M8or9x6ss
            // And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282

--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -95,6 +95,24 @@ impl Inst {
        let dst = WritableGpr::from_writable_reg(dst).unwrap();
        Inst::Setcc { cc, dst }
    }
+
+    fn xmm_rm_r_imm(
+        op: SseOpcode,
+        src: RegMem,
+        dst: Writable<Reg>,
+        imm: u8,
+        size: OperandSize,
+    ) -> Inst {
+        debug_assert!(size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
+        Inst::XmmRmRImm {
+            op,
+            src1: dst.to_reg(),
+            src2: src,
+            dst,
+            imm,
+            size,
+        }
+    }
 }

 #[test]
@@ -4738,6 +4756,7 @@ fn test_x64_emit() {
    insns.push((
        Inst::ElfTlsGetAddr {
            symbol: ExternalName::User(UserExternalNameRef::new(0)),
+            dst: WritableGpr::from_writable_reg(w_rax).unwrap(),
        },
        "66488D3D00000000666648E800000000",
        "%rax = elf_tls_get_addr User(userextname0)",
@@ -4746,6 +4765,7 @@ fn test_x64_emit() {
    insns.push((
        Inst::MachOTlsGetAddr {
            symbol: ExternalName::User(UserExternalNameRef::new(0)),
+            dst: WritableGpr::from_writable_reg(w_rax).unwrap(),
        },
        "488B3D00000000FF17",
        "%rax = macho_tls_get_addr User(userextname0)",
@@ -4754,6 +4774,7 @@ fn test_x64_emit() {
    insns.push((
        Inst::CoffTlsGetAddr {
            symbol: ExternalName::User(UserExternalNameRef::new(0)),
+            dst: WritableGpr::from_writable_reg(w_rax).unwrap(),
        },
        "8B050000000065488B0C2558000000488B04C1488D8000000000",
        "%rax = coff_tls_get_addr User(userextname0)",
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -263,12 +263,6 @@ impl Inst {
        Inst::MovRR { size, src, dst }
    }

-    pub(crate) fn xmm_load_const(src: VCodeConstant, dst: Writable<Reg>, ty: Type) -> Inst {
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        debug_assert!(ty.is_vector() && ty.bits() == 128);
-        Inst::XmmLoadConst { src, dst, ty }
-    }
-
    /// Convenient helper for unary float operations.
    pub(crate) fn xmm_unary_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
        src.assert_regclass_is(RegClass::Float);
@@ -377,24 +371,6 @@ impl Inst {
        }
    }

-    pub(crate) fn xmm_rm_r_imm(
-        op: SseOpcode,
-        src: RegMem,
-        dst: Writable<Reg>,
-        imm: u8,
-        size: OperandSize,
-    ) -> Inst {
-        debug_assert!(size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
-        Inst::XmmRmRImm {
-            op,
-            src1: dst.to_reg(),
-            src2: src,
-            dst,
-            imm,
-            size,
-        }
-    }
-
    pub(crate) fn movzx_rm_r(ext_mode: ExtMode, src: RegMem, dst: Writable<Reg>) -> Inst {
        src.assert_regclass_is(RegClass::Int);
        debug_assert!(dst.to_reg().class() == RegClass::Int);
@@ -1544,16 +1520,19 @@ impl PrettyPrint for Inst {

            Inst::Ud2 { trap_code } => format!("ud2 {}", trap_code),

-            Inst::ElfTlsGetAddr { ref symbol } => {
-                format!("%rax = elf_tls_get_addr {:?}", symbol)
+            Inst::ElfTlsGetAddr { ref symbol, dst } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                format!("{} = elf_tls_get_addr {:?}", dst, symbol)
            }

-            Inst::MachOTlsGetAddr { ref symbol } => {
-                format!("%rax = macho_tls_get_addr {:?}", symbol)
+            Inst::MachOTlsGetAddr { ref symbol, dst } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                format!("{} = macho_tls_get_addr {:?}", dst, symbol)
            }

-            Inst::CoffTlsGetAddr { ref symbol } => {
-                format!("%rax = coff_tls_get_addr {:?}", symbol)
+            Inst::CoffTlsGetAddr { ref symbol, dst } => {
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                format!("{} = coff_tls_get_addr {:?}", dst, symbol)
            }

            Inst::Unwind { inst } => {
@@ -1994,8 +1973,8 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
            // No registers are used.
        }

-        Inst::ElfTlsGetAddr { .. } | Inst::MachOTlsGetAddr { .. } => {
-            collector.reg_def(Writable::from_reg(regs::rax()));
+        Inst::ElfTlsGetAddr { dst, .. } | Inst::MachOTlsGetAddr { dst, .. } => {
+            collector.reg_fixed_def(dst.to_writable_reg(), regs::rax());
            // All caller-saves are clobbered.
            //
            // We use the SysV calling convention here because the
@@ -2007,12 +1986,12 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
            collector.reg_clobbers(clobbers);
        }

-        Inst::CoffTlsGetAddr { .. } => {
+        Inst::CoffTlsGetAddr { dst, .. } => {
            // We also use the gs register. But that register is not allocatable by the
            // register allocator, so we don't need to mark it as used here.

            // We use %rax to set the address
-            collector.reg_def(Writable::from_reg(regs::rax()));
+            collector.reg_fixed_def(dst.to_writable_reg(), regs::rax());

            // We use %rcx as a temporary variable to load the _tls_index
            collector.reg_def(Writable::from_reg(regs::rcx()));
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -3694,3 +3694,66 @@
            (lo Reg (value_regs_get regs 0))
            (hi Reg (value_regs_get regs 1)))
        (output_pair lo hi)))
+
+;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (tls_value (symbol_value_data name _ _)))
+      (if (tls_model_is_elf_gd))
+      (elf_tls_get_addr name))
+
+(rule (lower (tls_value (symbol_value_data name _ _)))
+      (if (tls_model_is_macho))
+      (macho_tls_get_addr name))
+
+(rule (lower (tls_value (symbol_value_data name _ _)))
+      (if (tls_model_is_coff))
+      (coff_tls_get_addr name))
+
+;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (sqmul_round_sat qx @ (value_type $I16X8) qy))
+      (let ((src1 Xmm qx)
+            (src2 Xmm qy)
+
+            (mask Xmm (x64_xmm_load_const $I16X8 (sqmul_round_sat_mask)))
+            (dst Xmm (x64_pmulhrsw src1 src2))
+            (cmp Xmm (x64_pcmpeqw mask dst)))
+        (x64_pxor dst cmp)))
+
+;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; TODO: currently we only lower a special case of `uunarrow` needed to support
+;; the translation of wasm's i32x4.trunc_sat_f64x2_u_zero operation.
+;; https://github.com/bytecodealliance/wasmtime/issues/4791
+;;
+;; y = i32x4.trunc_sat_f64x2_u_zero(x) is lowered to:
+;; MOVAPD xmm_y, xmm_x
+;; XORPD xmm_tmp, xmm_tmp
+;; MAXPD xmm_y, xmm_tmp
+;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
+;; ROUNDPD xmm_y, xmm_y, 0x0B
+;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
+;; SHUFPS xmm_y, xmm_tmp, 0x88
+(rule (lower (uunarrow (fcvt_to_uint_sat src @ (value_type $F64X2))
+                       (vconst (u128_from_constant 0))))
+      (let ((src Xmm src)
+
+            ;; MOVAPD xmm_y, xmm_x
+            ;; XORPD xmm_tmp, xmm_tmp
+            (zeros Xmm (x64_xorpd src src))
+            (dst Xmm (x64_maxpd src zeros))
+
+            (umax_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_umax_mask)))
+
+            ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
+            (dst Xmm (x64_minpd dst umax_mask))
+
+            ;; ROUNDPD xmm_y, xmm_y, 0x0B
+            (dst Xmm (x64_roundpd dst (RoundImm.RoundZero)))
+
+            ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
+            (uint_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_uint_mask)))
+            (dst Xmm (x64_addpd dst uint_mask)))
+
+        ;; SHUFPS xmm_y, xmm_tmp, 0x88
+        (x64_shufps dst zeros 0x88)))
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -11,7 +11,7 @@ use crate::isa::{x64::settings as x64_settings, x64::X64Backend, CallConv};
 use crate::machinst::lower::*;
 use crate::machinst::*;
 use crate::result::CodegenResult;
-use crate::settings::{Flags, TlsModel};
+use crate::settings::Flags;
 use smallvec::SmallVec;
 use target_lexicon::Triple;

@@ -304,33 +304,15 @@ fn lower_insn_to_regs(
    isa_flags: &x64_settings::Flags,
    triple: &Triple,
 ) -> CodegenResult<()> {
-    let op = ctx.data(insn).opcode();
-
-    let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn))
-        .map(|i| InsnInput { insn, input: i })
-        .collect();
    let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn))
        .map(|i| InsnOutput { insn, output: i })
        .collect();

-    let ty = if outputs.len() > 0 {
-        Some(ctx.output_ty(insn, 0))
-    } else {
-        None
-    };
-
    if let Ok(()) = isle::lower(ctx, triple, flags, isa_flags, &outputs, insn) {
        return Ok(());
    }

-    let implemented_in_isle = |ctx: &mut Lower<Inst>| {
-        unreachable!(
-            "implemented in ISLE: inst = `{}`, type = `{:?}`",
-            ctx.dfg().display_inst(insn),
-            ty
-        )
-    };
-
+    let op = ctx.data(insn).opcode();
    match op {
        Opcode::Iconst
        | Opcode::Bconst
@@ -474,152 +456,25 @@ fn lower_insn_to_regs(
        | Opcode::VallTrue
        | Opcode::VhighBits
        | Opcode::Iconcat
-        | Opcode::Isplit => {
-            implemented_in_isle(ctx);
+        | Opcode::Isplit
+        | Opcode::TlsValue
+        | Opcode::SqmulRoundSat
+        | Opcode::Uunarrow => {
+            let ty = if outputs.len() > 0 {
+                Some(ctx.output_ty(insn, 0))
+            } else {
+                None
+            };
+
+            unreachable!(
+                "implemented in ISLE: inst = `{}`, type = `{:?}`",
+                ctx.dfg().display_inst(insn),
+                ty
+            )
        }

        Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),

-        Opcode::TlsValue => {
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let (name, _, _) = ctx.symbol_value(insn).unwrap();
-            let symbol = name.clone();
-
-            match flags.tls_model() {
-                TlsModel::ElfGd => {
-                    ctx.emit(Inst::ElfTlsGetAddr { symbol });
-                    ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
-                }
-                TlsModel::Macho => {
-                    ctx.emit(Inst::MachOTlsGetAddr { symbol });
-                    ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
-                }
-                TlsModel::Coff => {
-                    ctx.emit(Inst::CoffTlsGetAddr { symbol });
-                    ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
-                }
-                _ => todo!(
-                    "Unimplemented TLS model in x64 backend: {:?}",
-                    flags.tls_model()
-                ),
-            }
-        }
-
-        Opcode::SqmulRoundSat => {
-            // Lane-wise saturating rounding multiplication in Q15 format
-            // Optimal lowering taken from instruction proposal https://github.com/WebAssembly/simd/pull/365
-            // y = i16x8.q15mulr_sat_s(a, b) is lowered to:
-            //MOVDQA xmm_y, xmm_a
-            //MOVDQA xmm_tmp, wasm_i16x8_splat(0x8000)
-            //PMULHRSW xmm_y, xmm_b
-            //PCMPEQW xmm_tmp, xmm_y
-            //PXOR xmm_y, xmm_tmp
-            let input_ty = ctx.input_ty(insn, 0);
-            let src1 = put_input_in_reg(ctx, inputs[0]);
-            let src2 = put_input_in_reg(ctx, inputs[1]);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            ctx.emit(Inst::gen_move(dst, src1, input_ty));
-            static SAT_MASK: [u8; 16] = [
-                0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
-                0x00, 0x80,
-            ];
-            let mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&SAT_MASK));
-            let mask = ctx.alloc_tmp(types::I16X8).only_reg().unwrap();
-            ctx.emit(Inst::xmm_load_const(mask_const, mask, types::I16X8));
-
-            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmulhrsw, RegMem::reg(src2), dst));
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Pcmpeqw,
-                RegMem::reg(dst.to_reg()),
-                mask,
-            ));
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Pxor,
-                RegMem::reg(mask.to_reg()),
-                dst,
-            ));
-        }
-
-        Opcode::Uunarrow => {
-            if let Some(fcvt_inst) = matches_input(ctx, inputs[0], Opcode::FcvtToUintSat) {
-                //y = i32x4.trunc_sat_f64x2_u_zero(x) is lowered to:
-                //MOVAPD xmm_y, xmm_x
-                //XORPD xmm_tmp, xmm_tmp
-                //MAXPD xmm_y, xmm_tmp
-                //MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
-                //ROUNDPD xmm_y, xmm_y, 0x0B
-                //ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
-                //SHUFPS xmm_y, xmm_xmp, 0x88
-
-                let fcvt_input = InsnInput {
-                    insn: fcvt_inst,
-                    input: 0,
-                };
-                let input_ty = ctx.input_ty(fcvt_inst, 0);
-                let output_ty = ctx.output_ty(insn, 0);
-                let src = put_input_in_reg(ctx, fcvt_input);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-                ctx.emit(Inst::gen_move(dst, src, input_ty));
-                let tmp1 = ctx.alloc_tmp(output_ty).only_reg().unwrap();
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::from(tmp1), tmp1));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxpd, RegMem::from(tmp1), dst));
-
-                // 4294967295.0 is equivalent to 0x41EFFFFFFFE00000
-                static UMAX_MASK: [u8; 16] = [
-                    0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF,
-                    0xFF, 0xEF, 0x41,
-                ];
-                let umax_const = ctx.use_constant(VCodeConstantData::WellKnown(&UMAX_MASK));
-                let umax_mask = ctx.alloc_tmp(types::F64X2).only_reg().unwrap();
-                ctx.emit(Inst::xmm_load_const(umax_const, umax_mask, types::F64X2));
-
-                //MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Minpd,
-                    RegMem::from(umax_mask),
-                    dst,
-                ));
-                //ROUNDPD xmm_y, xmm_y, 0x0B
-                ctx.emit(Inst::xmm_rm_r_imm(
-                    SseOpcode::Roundpd,
-                    RegMem::reg(dst.to_reg()),
-                    dst,
-                    RoundImm::RoundZero.encode(),
-                    OperandSize::Size32,
-                ));
-                //ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
-                static UINT_MASK: [u8; 16] = [
-                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
-                    0x00, 0x30, 0x43,
-                ];
-                let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
-                let uint_mask = ctx.alloc_tmp(types::F64X2).only_reg().unwrap();
-                ctx.emit(Inst::xmm_load_const(
-                    uint_mask_const,
-                    uint_mask,
-                    types::F64X2,
-                ));
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Addpd,
-                    RegMem::from(uint_mask),
-                    dst,
-                ));
-
-                //SHUFPS xmm_y, xmm_xmp, 0x88
-                ctx.emit(Inst::xmm_rm_r_imm(
-                    SseOpcode::Shufps,
-                    RegMem::reg(tmp1.to_reg()),
-                    dst,
-                    0x88,
-                    OperandSize::Size32,
-                ));
-            } else {
-                println!("Did not match fcvt input!");
-            }
-        }
-
        // Unimplemented opcodes below. These are not currently used by Wasm
        // lowering or other known embeddings, but should be either supported or
        // removed eventually
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -911,6 +911,39 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
            .use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE))
    }

+    #[inline]
+    fn sqmul_round_sat_mask(&mut self) -> VCodeConstant {
+        static SAT_MASK: [u8; 16] = [
+            0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+            0x00, 0x80,
+        ];
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&SAT_MASK))
+    }
+
+    #[inline]
+    fn uunarrow_umax_mask(&mut self) -> VCodeConstant {
+        // 4294967295.0 is equivalent to 0x41EFFFFFFFE00000
+        static UMAX_MASK: [u8; 16] = [
+            0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,
+            0xEF, 0x41,
+        ];
+
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&UMAX_MASK))
+    }
+
+    #[inline]
+    fn uunarrow_uint_mask(&mut self) -> VCodeConstant {
+        static UINT_MASK: [u8; 16] = [
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x30, 0x43,
+        ];
+
+        self.lower_ctx
+            .use_constant(VCodeConstantData::WellKnown(&UINT_MASK))
+    }
+
    fn emit_div_or_rem(
        &mut self,
        kind: &DivOrRemKind,
--- a/cranelift/codegen/src/machinst/isle.rs
+++ b/cranelift/codegen/src/machinst/isle.rs
@@ -664,6 +664,24 @@ macro_rules! isle_prelude_methods {
            }
        }

+        #[inline]
+        fn tls_model_is_macho(&mut self) -> Option<()> {
+            if self.flags.tls_model() == TlsModel::Macho {
+                Some(())
+            } else {
+                None
+            }
+        }
+
+        #[inline]
+        fn tls_model_is_coff(&mut self) -> Option<()> {
+            if self.flags.tls_model() == TlsModel::Coff {
+                Some(())
+            } else {
+                None
+            }
+        }
+
        #[inline]
        fn func_ref_data(&mut self, func_ref: FuncRef) -> (SigRef, ExternalName, RelocDistance) {
            let funcdata = &self.lower_ctx.dfg().ext_funcs[func_ref];
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -790,6 +790,12 @@
 (decl pure tls_model_is_elf_gd () Unit)
 (extern constructor tls_model_is_elf_gd tls_model_is_elf_gd)

+(decl pure tls_model_is_macho () Unit)
+(extern constructor tls_model_is_macho tls_model_is_macho)
+
+(decl pure tls_model_is_coff () Unit)
+(extern constructor tls_model_is_coff tls_model_is_coff)
+
 ;;;; Helpers for accessing instruction data ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; Accessor for `FuncRef`.
--- a/cranelift/filetests/filetests/isa/x64/sqmul_round_sat.clif
+++ b/cranelift/filetests/filetests/isa/x64/sqmul_round_sat.clif
@@ -0,0 +1,19 @@
+test compile precise-output
+target x86_64
+
+function %f1(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+  v2 = sqmul_round_sat v0, v1
+  return v2
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   load_const VCodeConstant(0), %xmm7
+;   pmulhrsw %xmm0, %xmm1, %xmm0
+;   pcmpeqw %xmm7, %xmm0, %xmm7
+;   pxor    %xmm0, %xmm7, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
--- a/cranelift/filetests/filetests/isa/x64/uunarrow.clif
+++ b/cranelift/filetests/filetests/isa/x64/uunarrow.clif
@@ -0,0 +1,26 @@
+test compile precise-output
+target x86_64
+
+function %f1(f64x2) -> i32x4 {
+block0(v0: f64x2):
+  v1 = fcvt_to_uint_sat.i64x2 v0
+  v2 = vconst.i64x2 [0 0]
+  v3 = uunarrow v1, v2
+  return v3
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   xorpd   %xmm3, %xmm3, %xmm3
+;   maxpd   %xmm0, %xmm3, %xmm0
+;   load_const VCodeConstant(0), %xmm7
+;   minpd   %xmm0, %xmm7, %xmm0
+;   roundpd $3, %xmm0, %xmm0
+;   load_const VCodeConstant(1), %xmm13
+;   addpd   %xmm0, %xmm13, %xmm0
+;   shufps  $136, %xmm0, %xmm3, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+