diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index c2b4deda36..2e09e60a5c 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1273,6 +1273,8 @@ impl MachInstEmit for Inst {
                       mov x28, x26
                    so that we simply write in the destination, the "2nd arg for op".
                 */
+                // TODO: We should not hardcode registers here; a better approach would be to
+                // pass some scratch registers in the AtomicRMW pseudo-instruction and use those.
                 let xzr = zero_reg();
                 let x24 = xreg(24);
                 let x25 = xreg(25);
@@ -1294,25 +1296,90 @@ impl MachInstEmit for Inst {
                 }
                 sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
 
-                if op == inst_common::AtomicRmwOp::Xchg {
-                    // mov x28, x26
-                    sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26))
-                } else {
-                    // add/sub/and/orr/eor x28, x27, x26
-                    let bits_31_21 = match op {
-                        inst_common::AtomicRmwOp::Add => 0b100_01011_00_0,
-                        inst_common::AtomicRmwOp::Sub => 0b110_01011_00_0,
-                        inst_common::AtomicRmwOp::And => 0b100_01010_00_0,
-                        inst_common::AtomicRmwOp::Or => 0b101_01010_00_0,
-                        inst_common::AtomicRmwOp::Xor => 0b110_01010_00_0,
-                        inst_common::AtomicRmwOp::Nand
-                        | inst_common::AtomicRmwOp::Umin
-                        | inst_common::AtomicRmwOp::Umax
-                        | inst_common::AtomicRmwOp::Smin
-                        | inst_common::AtomicRmwOp::Smax => todo!("{:?}", op),
-                        inst_common::AtomicRmwOp::Xchg => unreachable!(),
-                    };
-                    sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26));
+                match op {
+                    AtomicRmwOp::Xchg => {
+                        // mov x28, x26
+                        Inst::Mov64 { rd: x28wr, rm: x26 }.emit(sink, emit_info, state);
+                    }
+                    AtomicRmwOp::Nand => {
+                        // and x28, x27, x26
+                        // mvn x28, x28
+
+                        Inst::AluRRR {
+                            alu_op: ALUOp::And64,
+                            rd: x28wr,
+                            rn: x27,
+                            rm: x26,
+                        }
+                        .emit(sink, emit_info, state);
+
+                        Inst::AluRRR {
+                            alu_op: ALUOp::OrrNot64,
+                            rd: x28wr,
+                            rn: xzr,
+                            rm: x28,
+                        }
+                        .emit(sink, emit_info, state);
+                    }
+                    AtomicRmwOp::Umin
+                    | AtomicRmwOp::Umax
+                    | AtomicRmwOp::Smin
+                    | AtomicRmwOp::Smax => {
+                        // cmp x27, x26
+                        // csel.op x28, x27, x26
+
+                        let cond = match op {
+                            AtomicRmwOp::Umin => Cond::Lo,
+                            AtomicRmwOp::Umax => Cond::Hi,
+                            AtomicRmwOp::Smin => Cond::Lt,
+                            AtomicRmwOp::Smax => Cond::Gt,
+                            _ => unreachable!(),
+                        };
+
+                        Inst::AluRRR {
+                            alu_op: if ty == I64 {
+                                ALUOp::SubS64
+                            } else {
+                                ALUOp::SubS32
+                            },
+                            rd: writable_zero_reg(),
+                            rn: x27,
+                            rm: x26,
+                        }
+                        .emit(sink, emit_info, state);
+
+                        Inst::CSel {
+                            cond,
+                            rd: x28wr,
+                            rn: x27,
+                            rm: x26,
+                        }
+                        .emit(sink, emit_info, state);
+                    }
+                    _ => {
+                        // add/sub/and/orr/eor x28, x27, x26
+                        let alu_op = match op {
+                            AtomicRmwOp::Add => ALUOp::Add64,
+                            AtomicRmwOp::Sub => ALUOp::Sub64,
+                            AtomicRmwOp::And => ALUOp::And64,
+                            AtomicRmwOp::Or => ALUOp::Orr64,
+                            AtomicRmwOp::Xor => ALUOp::Eor64,
+                            AtomicRmwOp::Nand
+                            | AtomicRmwOp::Umin
+                            | AtomicRmwOp::Umax
+                            | AtomicRmwOp::Smin
+                            | AtomicRmwOp::Smax
+                            | AtomicRmwOp::Xchg => unreachable!(),
+                        };
+
+                        Inst::AluRRR {
+                            alu_op,
+                            rd: x28wr,
+                            rn: x27,
+                            rm: x26,
+                        }
+                        .emit(sink, emit_info, state);
+                    }
                 }
 
                 let srcloc = state.cur_srcloc();
diff --git a/cranelift/filetests/filetests/runtests/atomic-rmw-2.clif b/cranelift/filetests/filetests/runtests/atomic-rmw-2.clif
new file mode 100644
index 0000000000..2213c72be3
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/atomic-rmw-2.clif
@@ -0,0 +1,237 @@
+test run
+target aarch64
+target x86_64 machinst
+; TODO: Merge this with atomic-rmw.clif when s390x supports it
+
+
+function %atomic_rmw_nand_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 nand v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_nand_i64(0, 0) == -1
+; run: %atomic_rmw_nand_i64(1, 0) == -1
+; run: %atomic_rmw_nand_i64(0, 1) == -1
+; run: %atomic_rmw_nand_i64(1, 1) == -2
+; run: %atomic_rmw_nand_i64(0xC0FFEEEE_DECAFFFF, 0x7DCB5691_7DCB5691) == 0xBF34B97F_A335A96E
+
+function %atomic_rmw_nand_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 nand v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_nand_i32(0, 0) == -1
+; run: %atomic_rmw_nand_i32(1, 0) == -1
+; run: %atomic_rmw_nand_i32(0, 1) == -1
+; run: %atomic_rmw_nand_i32(1, 1) == -2
+; run: %atomic_rmw_nand_i32(0xC0FFEEEE, 0x7DCB5691) == 0xBF34B97F
+
+
+
+function %atomic_rmw_umin_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 umin v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_umin_i64(0, 0) == 0
+; run: %atomic_rmw_umin_i64(1, 0) == 0
+; run: %atomic_rmw_umin_i64(0, 1) == 0
+; run: %atomic_rmw_umin_i64(1, 1) == 1
+; run: %atomic_rmw_umin_i64(-1, 1) == 1
+; run: %atomic_rmw_umin_i64(-1, -3) == -3
+
+function %atomic_rmw_umin_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 umin v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_umin_i32(0, 0) == 0
+; run: %atomic_rmw_umin_i32(1, 0) == 0
+; run: %atomic_rmw_umin_i32(0, 1) == 0
+; run: %atomic_rmw_umin_i32(1, 1) == 1
+; run: %atomic_rmw_umin_i32(-1, 1) == 1
+; run: %atomic_rmw_umin_i32(-1, -3) == -3
+
+
+
+function %atomic_rmw_umax_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 umax v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_umax_i64(0, 0) == 0
+; run: %atomic_rmw_umax_i64(1, 0) == 1
+; run: %atomic_rmw_umax_i64(0, 1) == 1
+; run: %atomic_rmw_umax_i64(1, 1) == 1
+; run: %atomic_rmw_umax_i64(-1, 1) == -1
+; run: %atomic_rmw_umax_i64(-1, -3) == -1
+
+function %atomic_rmw_umax_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 umax v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_umax_i32(0, 0) == 0
+; run: %atomic_rmw_umax_i32(1, 0) == 1
+; run: %atomic_rmw_umax_i32(0, 1) == 1
+; run: %atomic_rmw_umax_i32(1, 1) == 1
+; run: %atomic_rmw_umax_i32(-1, 1) == -1
+; run: %atomic_rmw_umax_i32(-1, -3) == -1
+
+
+
+function %atomic_rmw_smin_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 smin v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_smin_i64(0, 0) == 0
+; run: %atomic_rmw_smin_i64(1, 0) == 0
+; run: %atomic_rmw_smin_i64(0, 1) == 0
+; run: %atomic_rmw_smin_i64(1, 1) == 1
+; run: %atomic_rmw_smin_i64(-1, 1) == -1
+; run: %atomic_rmw_smin_i64(-1, -3) == -3
+
+function %atomic_rmw_smin_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 smin v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_smin_i32(0, 0) == 0
+; run: %atomic_rmw_smin_i32(1, 0) == 0
+; run: %atomic_rmw_smin_i32(0, 1) == 0
+; run: %atomic_rmw_smin_i32(1, 1) == 1
+; run: %atomic_rmw_smin_i32(-1, -1) == -1
+; run: %atomic_rmw_smin_i32(-1, -3) == -3
+
+
+
+function %atomic_rmw_smax_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 smax v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_smax_i64(0, 0) == 0
+; run: %atomic_rmw_smax_i64(1, 0) == 1
+; run: %atomic_rmw_smax_i64(0, 1) == 1
+; run: %atomic_rmw_smax_i64(1, 1) == 1
+; run: %atomic_rmw_smax_i64(-1, 1) == 1
+; run: %atomic_rmw_smax_i64(-1, -3) == -1
+
+function %atomic_rmw_smax_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 smax v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_smax_i32(0, 0) == 0
+; run: %atomic_rmw_smax_i32(1, 0) == 1
+; run: %atomic_rmw_smax_i32(0, 1) == 1
+; run: %atomic_rmw_smax_i32(1, 1) == 1
+; run: %atomic_rmw_smax_i32(-1, 1) == 1
+; run: %atomic_rmw_smax_i32(-1, -3) == -1
+
+
+
+function %atomic_rmw_xchg_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 xchg v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_xchg_i64(0, 0) == 0
+; run: %atomic_rmw_xchg_i64(1, 0) == 0
+; run: %atomic_rmw_xchg_i64(0, 1) == 1
+; run: %atomic_rmw_xchg_i64(0, 0xC0FFEEEE_DECAFFFF) == 0xC0FFEEEE_DECAFFFF
+
+function %atomic_rmw_xchg_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 xchg v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_xchg_i32(0, 0) == 0
+; run: %atomic_rmw_xchg_i32(1, 0) == 0
+; run: %atomic_rmw_xchg_i32(0, 1) == 1
+; run: %atomic_rmw_xchg_i32(0, 0xC0FFEEEE) == 0xC0FFEEEE
diff --git a/cranelift/filetests/filetests/runtests/atomic-rmw.clif b/cranelift/filetests/filetests/runtests/atomic-rmw.clif
new file mode 100644
index 0000000000..eb9ed2c4d3
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/atomic-rmw.clif
@@ -0,0 +1,197 @@
+test run
+target aarch64
+target x86_64 machinst
+target s390x
+
+; We can't test that these instructions behave atomically here, but we can
+; test that they perform their operations correctly
+
+function %atomic_rmw_add_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 add v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_add_i64(0, 0) == 0
+; run: %atomic_rmw_add_i64(1, 0) == 1
+; run: %atomic_rmw_add_i64(0, 1) == 1
+; run: %atomic_rmw_add_i64(1, 1) == 2
+; run: %atomic_rmw_add_i64(0xC0FFEEEE_C0FFEEEE, 0x1DCB1111_1DCB1111) == 0xDECAFFFF_DECAFFFF
+
+function %atomic_rmw_add_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 add v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_add_i32(0, 0) == 0
+; run: %atomic_rmw_add_i32(1, 0) == 1
+; run: %atomic_rmw_add_i32(0, 1) == 1
+; run: %atomic_rmw_add_i32(1, 1) == 2
+; run: %atomic_rmw_add_i32(0xC0FFEEEE, 0x1DCB1111) == 0xDECAFFFF
+
+
+
+function %atomic_rmw_sub_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 sub v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_sub_i64(0, 0) == 0
+; run: %atomic_rmw_sub_i64(1, 0) == 1
+; run: %atomic_rmw_sub_i64(0, 1) == -1
+; run: %atomic_rmw_sub_i64(1, 1) == 0
+; run: %atomic_rmw_sub_i64(0xDECAFFFF_DECAFFFF, 0x1DCB1111_1DCB1111) == 0xC0FFEEEE_C0FFEEEE
+
+function %atomic_rmw_sub_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 sub v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_sub_i32(0, 0) == 0
+; run: %atomic_rmw_sub_i32(1, 0) == 1
+; run: %atomic_rmw_sub_i32(0, 1) == -1
+; run: %atomic_rmw_sub_i32(1, 1) == 0
+; run: %atomic_rmw_sub_i32(0xDECAFFFF, 0x1DCB1111) == 0xC0FFEEEE
+
+
+
+function %atomic_rmw_and_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 and v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_and_i64(0, 0) == 0
+; run: %atomic_rmw_and_i64(1, 0) == 0
+; run: %atomic_rmw_and_i64(0, 1) == 0
+; run: %atomic_rmw_and_i64(1, 1) == 1
+; run: %atomic_rmw_and_i64(0xF1FFFEFE_FEEEFFFF, 0xCEFFEFEF_DFDBFFFF) == 0xC0FFEEEE_DECAFFFF
+
+function %atomic_rmw_and_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 and v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+
+; run: %atomic_rmw_and_i32(0, 0) == 0
+; run: %atomic_rmw_and_i32(1, 0) == 0
+; run: %atomic_rmw_and_i32(0, 1) == 0
+; run: %atomic_rmw_and_i32(1, 1) == 1
+; run: %atomic_rmw_and_i32(0xF1FFFEFE, 0xCEFFEFEF) == 0xC0FFEEEE
+
+
+
+function %atomic_rmw_or_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 or v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_or_i64(0, 0) == 0
+; run: %atomic_rmw_or_i64(1, 0) == 1
+; run: %atomic_rmw_or_i64(0, 1) == 1
+; run: %atomic_rmw_or_i64(1, 1) == 1
+; run: %atomic_rmw_or_i64(0x80AAAAAA_8A8AAAAA, 0x40554444_54405555) == 0xC0FFEEEE_DECAFFFF
+
+function %atomic_rmw_or_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 or v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+
+; run: %atomic_rmw_or_i32(0, 0) == 0
+; run: %atomic_rmw_or_i32(1, 0) == 1
+; run: %atomic_rmw_or_i32(0, 1) == 1
+; run: %atomic_rmw_or_i32(1, 1) == 1
+; run: %atomic_rmw_or_i32(0x80AAAAAA, 0x40554444) == 0xC0FFEEEE
+
+
+
+function %atomic_rmw_xor_i64(i64, i64) -> i64 {
+    ss0 = explicit_slot 8
+
+block0(v0: i64, v1: i64):
+    stack_store.i64 v0, ss0
+
+    v2 = stack_addr.i64 ss0
+    v3 = atomic_rmw.i64 xor v2, v1
+
+    v4 = stack_load.i64 ss0
+    return v4
+}
+; run: %atomic_rmw_xor_i64(0, 0) == 0
+; run: %atomic_rmw_xor_i64(1, 0) == 1
+; run: %atomic_rmw_xor_i64(0, 1) == 1
+; run: %atomic_rmw_xor_i64(1, 1) == 0
+; run: %atomic_rmw_xor_i64(0x8FA50A64_9440A07D, 0x4F5AE48A_4A8A5F82) == 0xC0FFEEEE_DECAFFFF
+
+function %atomic_rmw_xor_i32(i32, i32) -> i32 {
+    ss0 = explicit_slot 4
+
+block0(v0: i32, v1: i32):
+    stack_store.i32 v0, ss0
+
+    v2 = stack_addr.i32 ss0
+    v3 = atomic_rmw.i32 xor v2, v1
+
+    v4 = stack_load.i32 ss0
+    return v4
+}
+; run: %atomic_rmw_xor_i32(0, 0) == 0
+; run: %atomic_rmw_xor_i32(1, 0) == 1
+; run: %atomic_rmw_xor_i32(0, 1) == 1
+; run: %atomic_rmw_xor_i32(1, 1) == 0
+; run: %atomic_rmw_xor_i32(0x8FA50A64, 0x4F5AE48A) == 0xC0FFEEEE