diff --git a/build.rs b/build.rs index c331648114..f658c3a6da 100644 --- a/build.rs +++ b/build.rs @@ -189,8 +189,10 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("simd", "simd_f64x2_cmp") => return false, ("simd", "simd_i8x16_arith") => return false, ("simd", "simd_i8x16_cmp") => return false, + ("simd", "simd_i8x16_sat_arith") => return false, ("simd", "simd_i16x8_arith") => return false, ("simd", "simd_i16x8_cmp") => return false, + ("simd", "simd_i16x8_sat_arith") => return false, ("simd", "simd_i32x4_arith") => return false, ("simd", "simd_i32x4_cmp") => return false, ("simd", "simd_load_extend") => return false, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index e872acd18c..7bbbdbdc6e 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1311,18 +1311,22 @@ impl MachInstEmit for Inst { debug_assert_eq!(I64, ty); (0b010_11110_11_1, 0b000011) } + VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011), VecALUOp::SQSubScalar => { debug_assert_eq!(I64, ty); (0b010_11110_11_1, 0b001011) } + VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011), VecALUOp::UQAddScalar => { debug_assert_eq!(I64, ty); (0b011_11110_11_1, 0b000011) } + VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011), VecALUOp::UQSubScalar => { debug_assert_eq!(I64, ty); (0b011_11110_11_1, 0b001011) } + VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011), VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011), VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111), VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101), diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 8f25043e6f..9b00bb6304 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -2049,6 +2049,198 @@ fn test_aarch64_binemit() { "sqsub d21, d22, d23", )); + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + ty: I8X16, + }, + "410C284E", + "sqadd v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + ty: I16X8, + }, + "810D7C4E", + "sqadd v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + ty: I32X4, + }, + "4C0CA64E", + "sqadd v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqadd, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + ty: I64X2, + }, + "F40CED4E", + "sqadd v20.2d, v7.2d, v13.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + ty: I8X16, + }, + "412C284E", + "sqsub v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + ty: I16X8, + }, + "812D7C4E", + "sqsub v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + ty: I32X4, + }, + "4C2CA64E", + "sqsub v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Sqsub, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + ty: I64X2, + }, + "F42CED4E", + "sqsub 
v20.2d, v7.2d, v13.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + ty: I8X16, + }, + "410C286E", + "uqadd v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + ty: I16X8, + }, + "810D7C6E", + "uqadd v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + ty: I32X4, + }, + "4C0CA66E", + "uqadd v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqadd, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + ty: I64X2, + }, + "F40CED6E", + "uqadd v20.2d, v7.2d, v13.2d", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(8), + ty: I8X16, + }, + "412C286E", + "uqsub v1.16b, v2.16b, v8.16b", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(1), + rn: vreg(12), + rm: vreg(28), + ty: I16X8, + }, + "812D7C6E", + "uqsub v1.8h, v12.8h, v28.8h", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(12), + rn: vreg(2), + rm: vreg(6), + ty: I32X4, + }, + "4C2CA66E", + "uqsub v12.4s, v2.4s, v6.4s", + )); + + insns.push(( + Inst::VecRRR { + alu_op: VecALUOp::Uqsub, + rd: writable_vreg(20), + rn: vreg(7), + rm: vreg(13), + ty: I64X2, + }, + "F42CED6E", + "uqsub v20.2d, v7.2d, v13.2d", + )); + insns.push(( Inst::VecRRR { alu_op: VecALUOp::Cmeq, diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 3f1f849336..1637fe7dae 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -209,12 +209,16 @@ pub enum VecExtendOp { pub enum VecALUOp { /// Signed saturating add SQAddScalar, + Sqadd, /// Unsigned saturating add UQAddScalar, + Uqadd, /// Signed saturating subtract SQSubScalar, + Sqsub, /// Unsigned saturating subtract UQSubScalar, + Uqsub, /// Compare bitwise equal Cmeq, /// Compare signed greater than or equal @@ -2734,9 +2738,13 @@ impl ShowWithRRU for Inst { } => { let (op, vector, ty) = match alu_op { VecALUOp::SQAddScalar => ("sqadd", false, ty), + VecALUOp::Sqadd => ("sqadd", true, ty), VecALUOp::UQAddScalar => ("uqadd", false, ty), + VecALUOp::Uqadd => ("uqadd", true, ty), VecALUOp::SQSubScalar => ("sqsub", false, ty), + VecALUOp::Sqsub => ("sqsub", true, ty), VecALUOp::UQSubScalar => ("uqsub", false, ty), + VecALUOp::Uqsub => ("uqsub", true, ty), VecALUOp::Cmeq => ("cmeq", true, ty), VecALUOp::Cmge => ("cmge", true, ty), VecALUOp::Cmgt => ("cmgt", true, ty), diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 664f2729a3..0deb06c2b8 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -93,74 +93,64 @@ pub(crate) fn lower_insn_to_regs>( }); } } - Opcode::UaddSat | Opcode::SaddSat => { + Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => { // We use the vector instruction set's saturating adds (UQADD / // SQADD), which require vector registers. 
- let is_signed = op == Opcode::SaddSat; - let narrow_mode = if is_signed { - NarrowValueMode::SignExtend64 - } else { - NarrowValueMode::ZeroExtend64 - }; - let alu_op = if is_signed { - VecALUOp::SQAddScalar - } else { - VecALUOp::UQAddScalar - }; - let va = ctx.alloc_tmp(RegClass::V128, I128); - let vb = ctx.alloc_tmp(RegClass::V128, I128); - let ra = put_input_in_reg(ctx, inputs[0], narrow_mode); - let rb = put_input_in_reg(ctx, inputs[1], narrow_mode); + let is_signed = op == Opcode::SaddSat || op == Opcode::SsubSat; + let ty = ty.unwrap(); let rd = get_output_reg(ctx, outputs[0]); - ctx.emit(Inst::MovToVec64 { rd: va, rn: ra }); - ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb }); - ctx.emit(Inst::VecRRR { - rd: va, - rn: va.to_reg(), - rm: vb.to_reg(), - alu_op, - ty: I64, - }); - ctx.emit(Inst::MovFromVec { - rd, - rn: va.to_reg(), - idx: 0, - ty: I64, - }); - } + if ty_bits(ty) < 128 { + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let alu_op = match op { + Opcode::UaddSat => VecALUOp::UQAddScalar, + Opcode::SaddSat => VecALUOp::SQAddScalar, + Opcode::UsubSat => VecALUOp::UQSubScalar, + Opcode::SsubSat => VecALUOp::SQSubScalar, + _ => unreachable!(), + }; + let va = ctx.alloc_tmp(RegClass::V128, I128); + let vb = ctx.alloc_tmp(RegClass::V128, I128); + let ra = put_input_in_reg(ctx, inputs[0], narrow_mode); + let rb = put_input_in_reg(ctx, inputs[1], narrow_mode); + ctx.emit(Inst::MovToVec64 { rd: va, rn: ra }); + ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb }); + ctx.emit(Inst::VecRRR { + rd: va, + rn: va.to_reg(), + rm: vb.to_reg(), + alu_op, + ty: I64, + }); + ctx.emit(Inst::MovFromVec { + rd, + rn: va.to_reg(), + idx: 0, + ty: I64, + }); + } else { + let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - Opcode::UsubSat | Opcode::SsubSat => { - let is_signed = op == Opcode::SsubSat; - let narrow_mode = if is_signed { - NarrowValueMode::SignExtend64 - } else { - NarrowValueMode::ZeroExtend64 - }; - let alu_op = if is_signed { - VecALUOp::SQSubScalar - } else { - VecALUOp::UQSubScalar - }; - let va = ctx.alloc_tmp(RegClass::V128, I128); - let vb = ctx.alloc_tmp(RegClass::V128, I128); - let ra = put_input_in_reg(ctx, inputs[0], narrow_mode); - let rb = put_input_in_reg(ctx, inputs[1], narrow_mode); - let rd = get_output_reg(ctx, outputs[0]); - ctx.emit(Inst::MovToVec64 { rd: va, rn: ra }); - ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb }); - ctx.emit(Inst::VecRRR { - rd: va, - rn: va.to_reg(), - rm: vb.to_reg(), - alu_op, - ty: I64, - }); - ctx.emit(Inst::MovFromVec { - rd, - rn: va.to_reg(), - idx: 0, - ty: I64, - }); + let alu_op = match op { + Opcode::UaddSat => VecALUOp::Uqadd, + Opcode::SaddSat => VecALUOp::Sqadd, + Opcode::UsubSat => VecALUOp::Uqsub, + Opcode::SsubSat => VecALUOp::Sqsub, + _ => unreachable!(), + }; + + ctx.emit(Inst::VecRRR { + rd, + rn, + rm, + alu_op, + ty, + }); + } } Opcode::Ineg => {
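
For reviewers, below is a small standalone Rust sketch (not part of the patch) that cross-checks two of the new vector encodings against the expected byte strings in the emit tests above. It assumes the usual AArch64 AdvSIMD "three registers, same type" field layout implied by the (top11, bits_15_10) pairs added in emit.rs — top 11 bits at [31:21], Rm at [20:16], the 6-bit opcode at [15:10], Rn at [9:5], Rd at [4:0] — and the helper name `encode_vec_rrr` is purely illustrative, not the backend's own function.

```rust
// Standalone sketch, not part of the patch: cross-check two of the new vector
// encodings against the expected machine code in the emit tests above.
//
// Assumed field layout (it matches the architectural AdvSIMD "three same"
// format and reproduces the test bytes): top11 at [31:21], Rm at [20:16],
// a 6-bit opcode at [15:10], Rn at [9:5], Rd at [4:0].
fn encode_vec_rrr(top11: u32, rm: u32, bits_15_10: u32, rn: u32, rd: u32) -> u32 {
    (top11 << 21) | (rm << 16) | (bits_15_10 << 10) | (rn << 5) | rd
}

fn main() {
    // enc_size selects the lane width: 0b00 = .16b, 0b01 = .8h, 0b10 = .4s, 0b11 = .2d.
    let enc_size_16b = 0b00;
    let enc_size_2d = 0b11;

    // VecALUOp::Sqadd, 16-byte lanes: top11 = 0b010_01110_00_1 | enc_size << 1,
    // opcode bits [15:10] = 0b000011 (from the emit.rs hunk above).
    let sqadd = encode_vec_rrr(
        0b010_01110_00_1 | (enc_size_16b << 1),
        8, // Rm = v8
        0b000011,
        2, // Rn = v2
        1, // Rd = v1
    );
    // "410C284E" in the emit test is the little-endian rendering of 0x4E280C41.
    assert_eq!(sqadd.to_le_bytes(), [0x41, 0x0C, 0x28, 0x4E]);

    // VecALUOp::Uqsub, 64-bit lanes: the U bit (bit 29) is set and the opcode
    // field becomes 0b001011.
    let uqsub = encode_vec_rrr(
        0b011_01110_00_1 | (enc_size_2d << 1),
        13, // Rm = v13
        0b001011,
        7,  // Rn = v7
        20, // Rd = v20
    );
    assert_eq!(uqsub.to_le_bytes(), [0xF4, 0x2C, 0xED, 0x6E]); // "F42CED6E"

    println!("sqadd v1.16b, v2.16b, v8.16b -> {:08X}", sqadd);
    println!("uqsub v20.2d, v7.2d, v13.2d -> {:08X}", uqsub);
}
```

Running the sketch should pass both assertions; the little-endian byte dumps correspond to the "410C284E" and "F42CED6E" strings used by the new test cases in emit_tests.rs.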