[s390x, abi_impl] Add i128 support (#4598)

This adds full i128 support to the s390x target, including new filetests
and enabling the existing i128 runtest on s390x.

The ABI requires that i128 is passed and returned via implicit pointer,
but the front end still generates direct i128 types in calls.  This means
we have to implement ABI support to implicitly convert i128 types to
pointers when passing arguments.
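
As a standalone sketch of what this means (plain Rust model, not
Cranelift code; all names here are illustrative): the call site spills
the i128 to a slot and passes the slot's address, and the callee's
prologue loads the value back through that pointer:

    // Model of the s390x ABI rule for i128: the value itself never
    // travels in argument registers; a pointer to a caller-owned
    // 16-byte slot does.

    // What the callee effectively receives after the ABI lowering:
    fn callee(arg: *const i128) -> i128 {
        // The prologue must dereference the implicit pointer so the
        // function body can keep working with a direct i128 value.
        unsafe { *arg }
    }

    fn caller(value: i128) -> i128 {
        // The call site must spill the direct i128 into a temporary
        // slot and pass the slot's address in place of the value.
        let slot = value;
        callee(&slot)
    }

    fn main() {
        assert_eq!(caller(42), 42);
    }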

To do so, we add a new variant ABIArg::ImplicitPtrArg.  This acts like
StructArg, except that the value type is the actual target type,
not a pointer type.  The required conversions have to be inserted
in the prologue and at function call sites.
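
A simplified model of the distinction (field names and shapes here are
assumptions for illustration, not the actual abi_impl definition, which
carries more fields):

    /// Illustrative sketch only; the real ABIArg carries more fields.
    #[derive(Debug)]
    enum AbiArgSketch {
        /// Value passed directly in registers and/or stack slots.
        Slots,
        /// Pointer passed in the argument slot; the value's CLIF-level
        /// type is already the pointer type, so no conversion is needed.
        StructArg { size: u64 },
        /// Pointer passed in the argument slot, but the value's
        /// CLIF-level type is the pointee type; the prologue and each
        /// call site must convert between value and pointer.
        ImplicitPtrArg { pointee_bits: u32 },
    }

    fn main() {
        // An i128 on s390x lands in the new variant:
        println!("{:?}", AbiArgSketch::ImplicitPtrArg { pointee_bits: 128 });
        // ...whereas an aggregate passed by reference stays StructArg:
        println!("{:?}", AbiArgSketch::StructArg { size: 32 });
        let _ = AbiArgSketch::Slots;
    }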

Note that when dereferencing the implicit pointer in the prologue,
we may require a temp register: the pointer may be passed on the
stack so it needs to be loaded first, but the value register may
be in the wrong class for pointer values.  In this case, we use
the "stack limit" register, which should be available at this
point in the prologue.
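
A sketch of that prologue logic, with real s390x mnemonics but assumed
register numbers and offsets (in particular, the stack-limit GPR number
below is a placeholder, not the backend's actual choice):

    #[derive(Clone, Copy)]
    enum PtrLoc {
        Gpr(u8),    // implicit pointer arrived in a general-purpose register
        Stack(i64), // implicit pointer arrived in the caller's argument area
    }

    // Placeholder for the reserved "stack limit" GPR.
    const STACK_LIMIT_GPR: u8 = 1;

    // Emit (as strings, for illustration) the prologue loads that
    // materialize an implicit-pointer i128 argument into vector reg `vreg`.
    fn deref_implicit_ptr(loc: PtrLoc, vreg: u8) -> Vec<String> {
        match loc {
            // Pointer already in a GPR: use it as the base address directly.
            PtrLoc::Gpr(r) => vec![format!("vl %v{vreg}, 0(%r{r})")],
            // Pointer on the stack: the destination is a vector register,
            // which cannot serve as a base address, so stage the pointer
            // through the stack-limit GPR, still free this early on.
            PtrLoc::Stack(off) => vec![
                format!("lg %r{STACK_LIMIT_GPR}, {off}(%r15)"),
                format!("vl %v{vreg}, 0(%r{STACK_LIMIT_GPR})"),
            ],
        }
    }

    fn main() {
        for inst in deref_implicit_ptr(PtrLoc::Stack(160), 24) {
            println!("{inst}");
        }
    }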

For return values, we use a mechanism similar to the one used for
supporting multiple return values in the Wasmtime ABI.  The only
difference is that the hidden pointer to the return buffer must
be the *first*, not last, argument in this case.
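
Conceptually, the rewrite looks like this at the signature level
(illustrative Rust; the real transformation happens on the ABI-level
signature, not on source code):

    // before: fn f(a: u64) -> i128
    // after:  fn f(ret: *mut i128, a: u64)   <- hidden pointer goes FIRST
    fn f_lowered(ret: &mut i128, a: u64) {
        // The callee writes the result through the hidden pointer
        // instead of returning it in registers.
        *ret = (a as i128) << 64 | a as i128;
    }

    fn call_f(a: u64) -> i128 {
        // The caller allocates the return buffer and passes its address
        // as the first argument, ahead of all normal arguments.
        let mut buf: i128 = 0;
        f_lowered(&mut buf, a);
        buf
    }

    fn main() {
        assert_eq!(call_f(1), (1i128 << 64) | 1);
    }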

(This implements the second half of issue #4565.)
Author: Ulrich Weigand
Date: 2022-08-04 22:41:26 +02:00
Committed by: GitHub
parent dc8362ceec
commit b17b1eb25d
46 changed files with 2424 additions and 166 deletions

@@ -2548,10 +2548,12 @@ impl MachInstEmit for Inst {
VecBinaryOp::Add16x8 => (0xe7f3, 1), // VAH
VecBinaryOp::Add32x4 => (0xe7f3, 2), // VAF
VecBinaryOp::Add64x2 => (0xe7f3, 3), // VAG
VecBinaryOp::Add128 => (0xe7f3, 4), // VAQ
VecBinaryOp::Sub8x16 => (0xe7f7, 0), // VSB
VecBinaryOp::Sub16x8 => (0xe7f7, 1), // VSH
VecBinaryOp::Sub32x4 => (0xe7f7, 2), // VSF
VecBinaryOp::Sub64x2 => (0xe7f7, 3), // VSG
VecBinaryOp::Sub128 => (0xe7f7, 4), // VSQ
VecBinaryOp::Mul8x16 => (0xe7a2, 0), // VMLB
VecBinaryOp::Mul16x8 => (0xe7a2, 1), // VMLHW
VecBinaryOp::Mul32x4 => (0xe7a2, 2), // VMLF
@@ -2650,6 +2652,14 @@ impl MachInstEmit for Inst {
VecUnaryOp::Popcnt16x8 => (0xe750, 1), // VPOPCTH
VecUnaryOp::Popcnt32x4 => (0xe750, 2), // VPOPCTF
VecUnaryOp::Popcnt64x2 => (0xe750, 3), // VPOPCTG
VecUnaryOp::Clz8x16 => (0xe753, 0), // VCLZB
VecUnaryOp::Clz16x8 => (0xe753, 1), // VCLZH
VecUnaryOp::Clz32x4 => (0xe753, 2), // VCLZF
VecUnaryOp::Clz64x2 => (0xe753, 3), // VCLZG
VecUnaryOp::Ctz8x16 => (0xe752, 0), // VCTZB
VecUnaryOp::Ctz16x8 => (0xe752, 1), // VCTZH
VecUnaryOp::Ctz32x4 => (0xe752, 2), // VCTZF
VecUnaryOp::Ctz64x2 => (0xe752, 3), // VCTZG
VecUnaryOp::UnpackULow8x16 => (0xe7d4, 0), // VUPLLB
VecUnaryOp::UnpackULow16x8 => (0xe7d4, 1), // VUPLLH
VecUnaryOp::UnpackULow32x4 => (0xe7d4, 2), // VUPLLF
@@ -2781,6 +2791,45 @@ impl MachInstEmit for Inst {
put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, m6));
}
&Inst::VecInt128SCmpHi { tmp, rn, rm } | &Inst::VecInt128UCmpHi { tmp, rn, rm } => {
// Synthetic instruction to compare 128-bit values.
// Sets CC 1 if rn > rm, sets a different CC otherwise.
let tmp = allocs.next_writable(tmp);
let rn = allocs.next(rn);
let rm = allocs.next(rm);
// Use VECTOR ELEMENT COMPARE to compare the high parts.
// Swap the inputs to get:
// CC 1 if high(rn) > high(rm)
// CC 2 if high(rn) < high(rm)
// CC 0 if high(rn) == high(rm)
let (opcode, m3) = match self {
&Inst::VecInt128SCmpHi { .. } => (0xe7db, 3), // VECG
&Inst::VecInt128UCmpHi { .. } => (0xe7d9, 3), // VECLG
_ => unreachable!(),
};
put(sink, &enc_vrr_a(opcode, rm, rn, m3, 0, 0));
// If CC != 0, we're done, so jump over the next instruction.
let opcode = 0xa74; // BRC
put(sink, &enc_ri_c(opcode, 7, 4 + 6));
// Otherwise, use VECTOR COMPARE HIGH LOGICAL.
// Since we already know the high parts are equal, the CC
// result will only depend on the low parts:
// CC 1 if low(rn) > low(rm)
// CC 3 if low(rn) <= low(rm)
let inst = Inst::VecIntCmpS {
op: VecIntCmpOp::UCmpHi64x2,
// N.B.: This is the first write to tmp, and it happens
// after all uses of rn and rm. If this were to ever
// change, tmp would have to become an early-def.
rd: tmp,
rn,
rm,
};
inst.emit(&[], sink, emit_info, state);
}
&Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => {
let rd = allocs.next_writable(rd);

@@ -8170,6 +8170,16 @@ fn test_s390x_binemit() {
"E748C00038F3",
"vag %v20, %v8, %v12",
));
insns.push((
Inst::VecRRR {
op: VecBinaryOp::Add128,
rd: writable_vr(20),
rn: vr(8),
rm: vr(12),
},
"E748C00048F3",
"vaq %v20, %v8, %v12",
));
insns.push((
Inst::VecRRR {
op: VecBinaryOp::Sub8x16,
@@ -8210,6 +8220,16 @@ fn test_s390x_binemit() {
"E748C00038F7",
"vsg %v20, %v8, %v12",
));
insns.push((
Inst::VecRRR {
op: VecBinaryOp::Sub128,
rd: writable_vr(20),
rn: vr(8),
rm: vr(12),
},
"E748C00048F7",
"vsq %v20, %v8, %v12",
));
insns.push((
Inst::VecRRR {
op: VecBinaryOp::Mul8x16,
@@ -9089,6 +9109,78 @@ fn test_s390x_binemit() {
"E74800003850",
"vpopctg %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Clz8x16,
rd: writable_vr(20),
rn: vr(8),
},
"E74800000853",
"vclzb %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Clz16x8,
rd: writable_vr(20),
rn: vr(8),
},
"E74800001853",
"vclzh %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Clz32x4,
rd: writable_vr(20),
rn: vr(8),
},
"E74800002853",
"vclzf %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Clz64x2,
rd: writable_vr(20),
rn: vr(8),
},
"E74800003853",
"vclzg %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Ctz8x16,
rd: writable_vr(20),
rn: vr(8),
},
"E74800000852",
"vctzb %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Ctz16x8,
rd: writable_vr(20),
rn: vr(8),
},
"E74800001852",
"vctzh %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Ctz32x4,
rd: writable_vr(20),
rn: vr(8),
},
"E74800002852",
"vctzf %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::Ctz64x2,
rd: writable_vr(20),
rn: vr(8),
},
"E74800003852",
"vctzg %v20, %v8",
));
insns.push((
Inst::VecRR {
op: VecUnaryOp::UnpackULow8x16,
@@ -9780,6 +9872,24 @@ fn test_s390x_binemit() {
"E748C01038F9",
"vchlgs %v20, %v8, %v12",
));
insns.push((
Inst::VecInt128SCmpHi {
tmp: writable_vr(20),
rn: vr(8),
rm: vr(12),
},
"E7C8000030DBA7740005E748C01038F9",
"vecg %v12, %v8 ; jne 10 ; vchlgs %v20, %v8, %v12",
));
insns.push((
Inst::VecInt128UCmpHi {
tmp: writable_vr(20),
rn: vr(8),
rm: vr(12),
},
"E7C8000030D9A7740005E748C01038F9",
"veclg %v12, %v8 ; jne 10 ; vchlgs %v20, %v8, %v12",
));
insns.push((
Inst::VecFloatCmp {

@@ -180,6 +180,8 @@ impl Inst {
| Inst::VecIntCmpS { .. }
| Inst::VecFloatCmp { .. }
| Inst::VecFloatCmpS { .. }
| Inst::VecInt128SCmpHi { .. }
| Inst::VecInt128UCmpHi { .. }
| Inst::VecLoad { .. }
| Inst::VecStore { .. }
| Inst::VecLoadReplicate { .. }
@@ -394,6 +396,7 @@ impl Inst {
lane_imm: 0,
},
_ if ty.is_vector() && ty.bits() == 128 => Inst::VecLoad { rd: into_reg, mem },
types::B128 | types::I128 => Inst::VecLoad { rd: into_reg, mem },
_ => unimplemented!("gen_load({})", ty),
}
}
@@ -418,6 +421,7 @@ impl Inst {
lane_imm: 0,
},
_ if ty.is_vector() && ty.bits() == 128 => Inst::VecStore { rd: from_reg, mem },
types::B128 | types::I128 => Inst::VecStore { rd: from_reg, mem },
_ => unimplemented!("gen_store({})", ty),
}
}
@@ -736,6 +740,11 @@ fn s390x_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandC
collector.reg_use(rn);
collector.reg_use(rm);
}
&Inst::VecInt128SCmpHi { tmp, rn, rm, .. } | &Inst::VecInt128UCmpHi { tmp, rn, rm, .. } => {
collector.reg_def(tmp);
collector.reg_use(rn);
collector.reg_use(rm);
}
&Inst::VecLoad { rd, ref mem, .. } => {
collector.reg_def(rd);
memarg_operands(mem, collector);
@@ -979,6 +988,11 @@ impl MachInst for Inst {
.only_reg()
.expect("multi-reg values not supported yet");
match ty {
types::I128 | types::B128 => {
let mut ret = SmallVec::new();
ret.push(Inst::load_vec_constant(to_reg, value));
ret
}
_ if ty.is_vector() && ty.bits() == 128 => {
let mut ret = SmallVec::new();
ret.push(Inst::load_vec_constant(to_reg, value));
@@ -1037,8 +1051,8 @@ impl MachInst for Inst {
types::R64 => Ok((&[RegClass::Int], &[types::R64])),
types::F32 => Ok((&[RegClass::Float], &[types::F32])),
types::F64 => Ok((&[RegClass::Float], &[types::F64])),
- types::I128 => Ok((&[RegClass::Int, RegClass::Int], &[types::I64, types::I64])),
- types::B128 => Ok((&[RegClass::Int, RegClass::Int], &[types::B64, types::B64])),
+ types::I128 => Ok((&[RegClass::Float], &[types::I128])),
+ types::B128 => Ok((&[RegClass::Float], &[types::B128])),
_ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])),
// FIXME: We don't really have IFLAGS, but need to allow it here
// for now to support the SelectifSpectreGuard instruction.
@@ -2202,10 +2216,12 @@ impl Inst {
VecBinaryOp::Add16x8 => "vah",
VecBinaryOp::Add32x4 => "vaf",
VecBinaryOp::Add64x2 => "vag",
VecBinaryOp::Add128 => "vaq",
VecBinaryOp::Sub8x16 => "vsb",
VecBinaryOp::Sub16x8 => "vsh",
VecBinaryOp::Sub32x4 => "vsf",
VecBinaryOp::Sub64x2 => "vsg",
VecBinaryOp::Sub128 => "vsq",
VecBinaryOp::Mul8x16 => "vmlb",
VecBinaryOp::Mul16x8 => "vmlhw",
VecBinaryOp::Mul32x4 => "vmlf",
@@ -2303,6 +2319,14 @@ impl Inst {
VecUnaryOp::Popcnt16x8 => "vpopcth",
VecUnaryOp::Popcnt32x4 => "vpopctf",
VecUnaryOp::Popcnt64x2 => "vpopctg",
VecUnaryOp::Clz8x16 => "vclzb",
VecUnaryOp::Clz16x8 => "vclzh",
VecUnaryOp::Clz32x4 => "vclzf",
VecUnaryOp::Clz64x2 => "vclzg",
VecUnaryOp::Ctz8x16 => "vctzb",
VecUnaryOp::Ctz16x8 => "vctzh",
VecUnaryOp::Ctz32x4 => "vctzf",
VecUnaryOp::Ctz64x2 => "vctzg",
VecUnaryOp::UnpackULow8x16 => "vupllb",
VecUnaryOp::UnpackULow16x8 => "vupllh",
VecUnaryOp::UnpackULow32x4 => "vupllf",
@@ -2425,6 +2449,20 @@ impl Inst {
let rm = pretty_print_reg(rm, allocs);
format!("{}{} {}, {}, {}", op, s, rd, rn, rm)
}
&Inst::VecInt128SCmpHi { tmp, rn, rm } | &Inst::VecInt128UCmpHi { tmp, rn, rm } => {
let op = match self {
&Inst::VecInt128SCmpHi { .. } => "vecg",
&Inst::VecInt128UCmpHi { .. } => "veclg",
_ => unreachable!(),
};
let tmp = pretty_print_reg(tmp.to_reg(), allocs);
let rn = pretty_print_reg(rn, allocs);
let rm = pretty_print_reg(rm, allocs);
format!(
"{} {}, {} ; jne 10 ; vchlgs {}, {}, {}",
op, rm, rn, tmp, rn, rm
)
}
&Inst::VecLoad { rd, ref mem } | &Inst::VecLoadRev { rd, ref mem } => {
let opcode = match self {
&Inst::VecLoad { .. } => "vl",