aarch64: Implement missing atomic rmw ops

Afonso Bordado
2021-06-24 19:44:55 +01:00
parent 1047c4e156
commit e85eb77c45
3 changed files with 520 additions and 19 deletions


@@ -1273,6 +1273,8 @@ impl MachInstEmit for Inst {
             mov x28, x26
         so that we simply write in the destination, the "2nd arg for op".
         */
+        // TODO: We should not hardcode registers here, a better idea would be to
+        // pass some scratch registers in the AtomicRMW pseudo-instruction, and use those
         let xzr = zero_reg();
         let x24 = xreg(24);
         let x25 = xreg(25);
@@ -1294,25 +1296,90 @@ impl MachInstEmit for Inst {
         }
         sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]
-        if op == inst_common::AtomicRmwOp::Xchg {
-            // mov x28, x26
-            sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26))
-        } else {
-            // add/sub/and/orr/eor x28, x27, x26
-            let bits_31_21 = match op {
-                inst_common::AtomicRmwOp::Add => 0b100_01011_00_0,
-                inst_common::AtomicRmwOp::Sub => 0b110_01011_00_0,
-                inst_common::AtomicRmwOp::And => 0b100_01010_00_0,
-                inst_common::AtomicRmwOp::Or => 0b101_01010_00_0,
-                inst_common::AtomicRmwOp::Xor => 0b110_01010_00_0,
-                inst_common::AtomicRmwOp::Nand
-                | inst_common::AtomicRmwOp::Umin
-                | inst_common::AtomicRmwOp::Umax
-                | inst_common::AtomicRmwOp::Smin
-                | inst_common::AtomicRmwOp::Smax => todo!("{:?}", op),
-                inst_common::AtomicRmwOp::Xchg => unreachable!(),
-            };
-            sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26));
+        match op {
+            AtomicRmwOp::Xchg => {
+                // mov x28, x26
+                Inst::Mov64 { rd: x28wr, rm: x26 }.emit(sink, emit_info, state);
+            }
+            AtomicRmwOp::Nand => {
+                // and x28, x27, x26
+                // mvn x28, x28
+
+                Inst::AluRRR {
+                    alu_op: ALUOp::And64,
+                    rd: x28wr,
+                    rn: x27,
+                    rm: x26,
+                }
+                .emit(sink, emit_info, state);
+
+                Inst::AluRRR {
+                    alu_op: ALUOp::OrrNot64,
+                    rd: x28wr,
+                    rn: xzr,
+                    rm: x28,
+                }
+                .emit(sink, emit_info, state);
+            }
+            AtomicRmwOp::Umin
+            | AtomicRmwOp::Umax
+            | AtomicRmwOp::Smin
+            | AtomicRmwOp::Smax => {
+                // cmp x27, x26
+                // csel.op x28, x27, x26
+                let cond = match op {
+                    AtomicRmwOp::Umin => Cond::Lo,
+                    AtomicRmwOp::Umax => Cond::Hi,
+                    AtomicRmwOp::Smin => Cond::Lt,
+                    AtomicRmwOp::Smax => Cond::Gt,
+                    _ => unreachable!(),
+                };
+
+                Inst::AluRRR {
+                    alu_op: if ty == I64 {
+                        ALUOp::SubS64
+                    } else {
+                        ALUOp::SubS32
+                    },
+                    rd: writable_zero_reg(),
+                    rn: x27,
+                    rm: x26,
+                }
+                .emit(sink, emit_info, state);
+
+                Inst::CSel {
+                    cond,
+                    rd: x28wr,
+                    rn: x27,
+                    rm: x26,
+                }
+                .emit(sink, emit_info, state);
+            }
+            _ => {
+                // add/sub/and/orr/eor x28, x27, x26
+                let alu_op = match op {
+                    AtomicRmwOp::Add => ALUOp::Add64,
+                    AtomicRmwOp::Sub => ALUOp::Sub64,
+                    AtomicRmwOp::And => ALUOp::And64,
+                    AtomicRmwOp::Or => ALUOp::Orr64,
+                    AtomicRmwOp::Xor => ALUOp::Eor64,
+                    AtomicRmwOp::Nand
+                    | AtomicRmwOp::Umin
+                    | AtomicRmwOp::Umax
+                    | AtomicRmwOp::Smin
+                    | AtomicRmwOp::Smax
+                    | AtomicRmwOp::Xchg => unreachable!(),
+                };
+                Inst::AluRRR {
+                    alu_op,
+                    rd: x28wr,
+                    rn: x27,
+                    rm: x26,
+                }
+                .emit(sink, emit_info, state);
+            }
         }
         let srcloc = state.cur_srcloc();
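Editor's note, not part of the diff: a minimal standalone Rust sketch of the value each arm of the new `match op` computes into x28 ("2nd arg for op") from the loaded value (x27) and the operand (x26). The `RmwOp` enum and `rmw_result` function are illustrative names, not Cranelift API; the comments name the aarch64 instructions the real code emits.

// Illustrative reference model (assumed names, not Cranelift API): the value the
// emitted LL/SC loop stores back for each op, given the loaded value and operand.
enum RmwOp { Xchg, Add, Sub, And, Or, Xor, Nand, Umin, Umax, Smin, Smax }

fn rmw_result(op: RmwOp, loaded: u64, operand: u64) -> u64 {
    match op {
        RmwOp::Xchg => operand,                                    // mov x28, x26
        RmwOp::Add => loaded.wrapping_add(operand),                // add x28, x27, x26
        RmwOp::Sub => loaded.wrapping_sub(operand),                // sub x28, x27, x26
        RmwOp::And => loaded & operand,                            // and x28, x27, x26
        RmwOp::Or => loaded | operand,                             // orr x28, x27, x26
        RmwOp::Xor => loaded ^ operand,                            // eor x28, x27, x26
        RmwOp::Nand => !(loaded & operand),                        // and; mvn (orn from xzr)
        RmwOp::Umin => loaded.min(operand),                        // cmp; csel .. lo
        RmwOp::Umax => loaded.max(operand),                        // cmp; csel .. hi
        RmwOp::Smin => (loaded as i64).min(operand as i64) as u64, // cmp; csel .. lt
        RmwOp::Smax => (loaded as i64).max(operand as i64) as u64, // cmp; csel .. gt
    }
}

fn main() {
    // Mirrors the nand_i64 filetest expectation below.
    assert_eq!(
        rmw_result(RmwOp::Nand, 0xC0FFEEEE_DECAFFFF, 0x7DCB5691_7DCB5691),
        0xBF34B97F_A335A96E
    );
    // Unsigned vs. signed max differ exactly where the Hi/Gt conditions differ.
    assert_eq!(rmw_result(RmwOp::Umax, u64::MAX, 1), u64::MAX);
    assert_eq!(rmw_result(RmwOp::Smax, u64::MAX, 1), 1); // -1 vs 1, signed
}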


@@ -0,0 +1,237 @@
test run
target aarch64
target x86_64 machinst
; TODO: Merge this with atomic-rmw.clif when s390x supports it
function %atomic_rmw_nand_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 nand v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_nand_i64(0, 0) == -1
; run: %atomic_rmw_nand_i64(1, 0) == -1
; run: %atomic_rmw_nand_i64(0, 1) == -1
; run: %atomic_rmw_nand_i64(1, 1) == -2
; run: %atomic_rmw_nand_i64(0xC0FFEEEE_DECAFFFF, 0x7DCB5691_7DCB5691) == 0xBF34B97F_A335A96E
function %atomic_rmw_nand_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 nand v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_nand_i32(0, 0) == -1
; run: %atomic_rmw_nand_i32(1, 0) == -1
; run: %atomic_rmw_nand_i32(0, 1) == -1
; run: %atomic_rmw_nand_i32(1, 1) == -2
; run: %atomic_rmw_nand_i32(0xC0FFEEEE, 0x7DCB5691) == 0xBF34B97F
function %atomic_rmw_umin_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 umin v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_umin_i64(0, 0) == 0
; run: %atomic_rmw_umin_i64(1, 0) == 0
; run: %atomic_rmw_umin_i64(0, 1) == 0
; run: %atomic_rmw_umin_i64(1, 1) == 1
; run: %atomic_rmw_umin_i64(-1, 1) == 1
; run: %atomic_rmw_umin_i64(-1, -3) == -3
function %atomic_rmw_umin_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 umin v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_umin_i32(0, 0) == 0
; run: %atomic_rmw_umin_i32(1, 0) == 0
; run: %atomic_rmw_umin_i32(0, 1) == 0
; run: %atomic_rmw_umin_i32(1, 1) == 1
; run: %atomic_rmw_umin_i32(-1, 1) == 1
; run: %atomic_rmw_umin_i32(-1, -3) == -3
function %atomic_rmw_umax_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 umax v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_umax_i64(0, 0) == 0
; run: %atomic_rmw_umax_i64(1, 0) == 1
; run: %atomic_rmw_umax_i64(0, 1) == 1
; run: %atomic_rmw_umax_i64(1, 1) == 1
; run: %atomic_rmw_umax_i64(-1, 1) == -1
; run: %atomic_rmw_umax_i64(-1, -3) == -1
function %atomic_rmw_umax_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 umax v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_umax_i32(0, 0) == 0
; run: %atomic_rmw_umax_i32(1, 0) == 1
; run: %atomic_rmw_umax_i32(0, 1) == 1
; run: %atomic_rmw_umax_i32(1, 1) == 1
; run: %atomic_rmw_umax_i32(-1, 1) == -1
; run: %atomic_rmw_umax_i32(-1, -3) == -1
function %atomic_rmw_smin_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 smin v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_smin_i64(0, 0) == 0
; run: %atomic_rmw_smin_i64(1, 0) == 0
; run: %atomic_rmw_smin_i64(0, 1) == 0
; run: %atomic_rmw_smin_i64(1, 1) == 1
; run: %atomic_rmw_smin_i64(-1, 1) == -1
; run: %atomic_rmw_smin_i64(-1, -3) == -3
function %atomic_rmw_smin_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 smin v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_smin_i32(0, 0) == 0
; run: %atomic_rmw_smin_i32(1, 0) == 0
; run: %atomic_rmw_smin_i32(0, 1) == 0
; run: %atomic_rmw_smin_i32(1, 1) == 1
; run: %atomic_rmw_smin_i32(-1, -1) == -1
; run: %atomic_rmw_smin_i32(-1, -3) == -3
function %atomic_rmw_smax_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 smax v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_smax_i64(0, 0) == 0
; run: %atomic_rmw_smax_i64(1, 0) == 1
; run: %atomic_rmw_smax_i64(0, 1) == 1
; run: %atomic_rmw_smax_i64(1, 1) == 1
; run: %atomic_rmw_smax_i64(-1, 1) == 1
; run: %atomic_rmw_smax_i64(-1, -3) == -1
function %atomic_rmw_smax_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 smax v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_smax_i32(0, 0) == 0
; run: %atomic_rmw_smax_i32(1, 0) == 1
; run: %atomic_rmw_smax_i32(0, 1) == 1
; run: %atomic_rmw_smax_i32(1, 1) == 1
; run: %atomic_rmw_smax_i32(-1, 1) == 1
; run: %atomic_rmw_smax_i32(-1, -3) == -1
function %atomic_rmw_xchg_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 xchg v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_xchg_i64(0, 0) == 0
; run: %atomic_rmw_xchg_i64(1, 0) == 0
; run: %atomic_rmw_xchg_i64(0, 1) == 1
; run: %atomic_rmw_xchg_i64(0, 0xC0FFEEEE_DECAFFFF) == 0xC0FFEEEE_DECAFFFF
function %atomic_rmw_xchg_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 xchg v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_xchg_i32(0, 0) == 0
; run: %atomic_rmw_xchg_i32(1, 0) == 0
; run: %atomic_rmw_xchg_i32(0, 1) == 1
; run: %atomic_rmw_xchg_i32(0, 0xC0FFEEEE) == 0xC0FFEEEE
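Editor's note: these run lines only check the value left in memory (the stack_load after the atomic_rmw). As a rough cross-check, the same expectations can be reproduced with Rust's std::sync::atomic types, which offer equivalent operations (fetch_nand, fetch_min/fetch_max, swap). This is just an illustration, not part of the test suite.

use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};

fn main() {
    // nand_i64: !(0xC0FFEEEE_DECAFFFF & 0x7DCB5691_7DCB5691)
    let m = AtomicU64::new(0xC0FFEEEE_DECAFFFF);
    m.fetch_nand(0x7DCB5691_7DCB5691, Ordering::SeqCst);
    assert_eq!(m.load(Ordering::SeqCst), 0xBF34B97F_A335A96E);

    // umin_i64(-1, -3): unsigned compare, so -1 (u64::MAX) is the larger value and -3 wins.
    let m = AtomicU64::new(u64::MAX);
    m.fetch_min(u64::MAX - 2, Ordering::SeqCst);
    assert_eq!(m.load(Ordering::SeqCst) as i64, -3);

    // smax_i64(-1, 1): signed compare, so 1 wins.
    let m = AtomicI64::new(-1);
    m.fetch_max(1, Ordering::SeqCst);
    assert_eq!(m.load(Ordering::SeqCst), 1);

    // xchg_i64(0, 0xC0FFEEEE_DECAFFFF): the stored value is simply the operand.
    let m = AtomicU64::new(0);
    m.swap(0xC0FFEEEE_DECAFFFF, Ordering::SeqCst);
    assert_eq!(m.load(Ordering::SeqCst), 0xC0FFEEEE_DECAFFFF);
}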


@@ -0,0 +1,197 @@
test run
target aarch64
target x86_64 machinst
target s390x
; We can't test that these instructions are actually atomic, but we can
; test that they perform their operation correctly
function %atomic_rmw_add_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 add v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_add_i64(0, 0) == 0
; run: %atomic_rmw_add_i64(1, 0) == 1
; run: %atomic_rmw_add_i64(0, 1) == 1
; run: %atomic_rmw_add_i64(1, 1) == 2
; run: %atomic_rmw_add_i64(0xC0FFEEEE_C0FFEEEE, 0x1DCB1111_1DCB1111) == 0xDECAFFFF_DECAFFFF
function %atomic_rmw_add_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 add v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_add_i32(0, 0) == 0
; run: %atomic_rmw_add_i32(1, 0) == 1
; run: %atomic_rmw_add_i32(0, 1) == 1
; run: %atomic_rmw_add_i32(1, 1) == 2
; run: %atomic_rmw_add_i32(0xC0FFEEEE, 0x1DCB1111) == 0xDECAFFFF
function %atomic_rmw_sub_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 sub v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_sub_i64(0, 0) == 0
; run: %atomic_rmw_sub_i64(1, 0) == 1
; run: %atomic_rmw_sub_i64(0, 1) == -1
; run: %atomic_rmw_sub_i64(1, 1) == 0
; run: %atomic_rmw_sub_i64(0xDECAFFFF_DECAFFFF, 0x1DCB1111_1DCB1111) == 0xC0FFEEEE_C0FFEEEE
function %atomic_rmw_sub_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 sub v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_sub_i32(0, 0) == 0
; run: %atomic_rmw_sub_i32(1, 0) == 1
; run: %atomic_rmw_sub_i32(0, 1) == -1
; run: %atomic_rmw_sub_i32(1, 1) == 0
; run: %atomic_rmw_sub_i32(0xDECAFFFF, 0x1DCB1111) == 0xC0FFEEEE
function %atomic_rmw_and_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 and v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_and_i64(0, 0) == 0
; run: %atomic_rmw_and_i64(1, 0) == 0
; run: %atomic_rmw_and_i64(0, 1) == 0
; run: %atomic_rmw_and_i64(1, 1) == 1
; run: %atomic_rmw_and_i64(0xF1FFFEFE_FEEEFFFF, 0xCEFFEFEF_DFDBFFFF) == 0xC0FFEEEE_DECAFFFF
function %atomic_rmw_and_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 and v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_and_i32(0, 0) == 0
; run: %atomic_rmw_and_i32(1, 0) == 0
; run: %atomic_rmw_and_i32(0, 1) == 0
; run: %atomic_rmw_and_i32(1, 1) == 1
; run: %atomic_rmw_and_i32(0xF1FFFEFE, 0xCEFFEFEF) == 0xC0FFEEEE
function %atomic_rmw_or_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 or v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_or_i64(0, 0) == 0
; run: %atomic_rmw_or_i64(1, 0) == 1
; run: %atomic_rmw_or_i64(0, 1) == 1
; run: %atomic_rmw_or_i64(1, 1) == 1
; run: %atomic_rmw_or_i64(0x80AAAAAA_8A8AAAAA, 0x40554444_54405555) == 0xC0FFEEEE_DECAFFFF
function %atomic_rmw_or_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 or v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_or_i32(0, 0) == 0
; run: %atomic_rmw_or_i32(1, 0) == 1
; run: %atomic_rmw_or_i32(0, 1) == 1
; run: %atomic_rmw_or_i32(1, 1) == 1
; run: %atomic_rmw_or_i32(0x80AAAAAA, 0x40554444) == 0xC0FFEEEE
function %atomic_rmw_xor_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8
block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0
v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 xor v2, v1
v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_xor_i64(0, 0) == 0
; run: %atomic_rmw_xor_i64(1, 0) == 1
; run: %atomic_rmw_xor_i64(0, 1) == 1
; run: %atomic_rmw_xor_i64(1, 1) == 0
; run: %atomic_rmw_xor_i64(0x8FA50A64_9440A07D, 0x4F5AE48A_4A8A5F82) == 0xC0FFEEEE_DECAFFFF
function %atomic_rmw_xor_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4
block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0
v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 xor v2, v1
v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_xor_i32(0, 0) == 0
; run: %atomic_rmw_xor_i32(1, 0) == 1
; run: %atomic_rmw_xor_i32(0, 1) == 1
; run: %atomic_rmw_xor_i32(1, 1) == 0
; run: %atomic_rmw_xor_i32(0x8FA50A64, 0x4F5AE48A) == 0xC0FFEEEE
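Editor's note: the hex run lines in this file are plain 64-bit modular arithmetic and bitwise identities. A quick illustrative check of the wide constants, not part of the commit:

fn main() {
    // add: 0xC0FFEEEE_C0FFEEEE + 0x1DCB1111_1DCB1111
    assert_eq!(
        0xC0FFEEEE_C0FFEEEE_u64.wrapping_add(0x1DCB1111_1DCB1111),
        0xDECAFFFF_DECAFFFF
    );
    // sub: the inverse of the addition above
    assert_eq!(
        0xDECAFFFF_DECAFFFF_u64.wrapping_sub(0x1DCB1111_1DCB1111),
        0xC0FFEEEE_C0FFEEEE
    );
    // and / or / xor
    assert_eq!(0xF1FFFEFE_FEEEFFFF_u64 & 0xCEFFEFEF_DFDBFFFF, 0xC0FFEEEE_DECAFFFF);
    assert_eq!(0x80AAAAAA_8A8AAAAA_u64 | 0x40554444_54405555, 0xC0FFEEEE_DECAFFFF);
    assert_eq!(0x8FA50A64_9440A07D_u64 ^ 0x4F5AE48A_4A8A5F82, 0xC0FFEEEE_DECAFFFF);
}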