From 7a3df7dcc05d4ce79c2a24998b91e461d634a9a5 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Tue, 21 Mar 2023 23:15:14 +0000 Subject: [PATCH] riscv64: Improve `ctz`/`clz`/`cls` codegen (#5854) * cranelift: Add extra runtests for `clz`/`ctz` * riscv64: Restrict lowering rules for `ctz`/`clz` * cranelift: Add `u64` isle helpers * riscv64: Improve `ctz` codegen * riscv64: Improve `clz` codegen * riscv64: Improve `cls` codegen * riscv64: Improve `clz.i128` codegen Instead of checking whether the top half has 64 leading zeros, check whether it *is* 0; that way we avoid loading the `64` constant. * riscv64: Improve `ctz.i128` codegen Instead of checking whether the bottom half has 64 trailing zeros, check whether it *is* 0; that way we avoid loading the `64` constant. * riscv64: Use extended value in `lower_cls` * riscv64: Use pattern matches on `bseti` --- cranelift/codegen/src/isa/riscv64/inst.isle | 199 +++++++++--------- cranelift/codegen/src/isa/riscv64/lower.isle | 6 +- cranelift/codegen/src/isle_prelude.rs | 5 + cranelift/codegen/src/prelude.isle | 3 + .../filetests/isa/riscv64/bitops.clif | 114 +++++----- .../filetests/isa/riscv64/cls-zbb.clif | 162 ++++++++++++++ .../filetests/isa/riscv64/clz-zbb.clif | 103 +++++++++ .../filetests/isa/riscv64/ctz-zbb-zbs.clif | 71 +++++++ .../filetests/isa/riscv64/ctz-zbb.clif | 102 +++++++++ .../filetests/filetests/runtests/cls.clif | 1 + .../filetests/filetests/runtests/clz.clif | 5 + .../filetests/filetests/runtests/ctz.clif | 8 +- .../filetests/runtests/i128-bitops-count.clif | 2 + .../filetests/runtests/i128-cls.clif | 3 +- 14 files changed, 617 insertions(+), 167 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/riscv64/cls-zbb.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/clz-zbb.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/ctz-zbb-zbs.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/ctz-zbb.clif diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle 
b/cranelift/codegen/src/isa/riscv64/inst.isle index 5eae843012..d4eed1f988 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -808,6 +808,28 @@ (decl imm12_from_u64 (Imm12) u64) (extern extractor imm12_from_u64 imm12_from_u64) + +;;;; Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +;; bseti: Set a single bit in a register, indexed by a constant. Without Zbs, bits 0..=10 are set with `ori` (the mask `1 << bit` must fit in a signed 12-bit immediate, i.e. be at most 2047); higher bits load the mask into a register and use `or`. With Zbs, the `bseti` instruction is used directly. +(decl bseti (Reg u64) Reg) +(rule (bseti val bit) + (if-let $false (has_zbs)) + (if-let $false (u64_le bit 10)) + (let ((const Reg (load_u64_constant (u64_shl 1 bit)))) + (alu_rrr (AluOPRRR.Or) val const))) + +(rule (bseti val bit) + (if-let $false (has_zbs)) + (if-let $true (u64_le bit 10)) + (alu_rr_imm12 (AluOPRRI.Ori) val (imm12_const (u64_as_i32 (u64_shl 1 bit))))) + +(rule (bseti val bit) + (if-let $true (has_zbs)) + (alu_rr_imm12 (AluOPRRI.Bseti) val (imm12_const (u64_as_i32 bit)))) + + ;; Float Helpers (decl gen_default_frm () OptionFloatRoundingMode) @@ -948,89 +970,103 @@ (decl lower_ctz (Type Reg) Reg) -(rule - (lower_ctz ty x) - (if-let $false (has_zbb)) +(rule (lower_ctz ty x) (gen_cltz $false x ty)) -(rule 2 - (lower_ctz $I64 x) +(rule 1 (lower_ctz (fits_in_16 ty) x) (if-let $true (has_zbb)) - (alu_rr_funct12 (AluOPRRI.Ctz) x)) + (let ((tmp Reg (bseti x (ty_bits ty)))) + (alu_rr_funct12 (AluOPRRI.Ctzw) tmp))) -(rule 2 - (lower_ctz $I32 x) +(rule 2 (lower_ctz $I32 x) (if-let $true (has_zbb)) (alu_rr_funct12 (AluOPRRI.Ctzw) x)) -;;;; for I8 and I16 -(rule 1 - (lower_ctz ty x) +(rule 2 (lower_ctz $I64 x) (if-let $true (has_zbb)) - (if-let $true (has_zbs)) - (let - ((tmp Reg (alu_rr_imm12 (AluOPRRI.Bseti) x (imm12_const (ty_bits ty))))) - (alu_rr_funct12 (AluOPRRI.Ctzw) x))) + (alu_rr_funct12 (AluOPRRI.Ctz) x)) -;;;; +;; Count trailing zeros of an i128 value. +;; We count both halves separately and conditionally add them if it makes sense. 
(decl lower_ctz_128 (ValueRegs) ValueRegs) -(rule - (lower_ctz_128 x) - (let - (;; count the low part. - (low Reg (lower_ctz $I64 (value_regs_get x 0))) - ;; count the high part. - (high_part Reg (lower_ctz $I64 (value_regs_get x 1))) - ;;; - (constant_64 Reg (load_u64_constant 64)) - ;;; - (high Reg (gen_select_reg (IntCC.Equal) constant_64 low high_part (zero_reg))) +(rule (lower_ctz_128 x) + (let ((x_lo Reg (value_regs_get x 0)) + (x_hi Reg (value_regs_get x 1)) + ;; Count both halves + (high Reg (lower_ctz $I64 x_hi)) + (low Reg (lower_ctz $I64 x_lo)) + ;; Only add the top half if the bottom is zero + (high Reg (gen_select_reg (IntCC.Equal) x_lo (zero_reg) high (zero_reg))) + (result Reg (alu_add low high))) + (zext result $I64 $I128))) + - ;; add low and high together. - (result Reg (alu_add low high))) - (value_regs result (load_u64_constant 0)))) (decl lower_clz (Type Reg) Reg) -(rule - (lower_clz ty rs) - (if-let $false (has_zbb)) +(rule (lower_clz ty rs) (gen_cltz $true rs ty)) -(rule 2 - (lower_clz $I64 r) + +(rule 1 (lower_clz (fits_in_16 ty) r) (if-let $true (has_zbb)) - (alu_rr_funct12 (AluOPRRI.Clz) r)) -(rule 2 - (lower_clz $I32 r) + (let ((tmp Reg (zext r ty $I64)) + (count Reg (alu_rr_funct12 (AluOPRRI.Clz) tmp)) + ;; `clz` counts over the full 64-bit register, so add `ty_bits - 64` to drop the leading zeros contributed by the zero-extension. + (result Reg (alu_rr_imm12 (AluOPRRI.Addi) count (imm12_const_add (ty_bits ty) -64)))) + result)) + +(rule 2 (lower_clz $I32 r) (if-let $true (has_zbb)) (alu_rr_funct12 (AluOPRRI.Clzw) r)) -;;; for I8 and I16 -(rule 1 - (lower_clz ty r) +(rule 2 (lower_clz $I64 r) (if-let $true (has_zbb)) - (let - ( ;; narrow int make all upper bits are zeros. - (tmp Reg (ext_int_if_need $false r ty )) - ;; - (count Reg (alu_rr_funct12 (AluOPRRI.Clz) tmp)) - ;;make result - (result Reg (alu_rr_imm12 (AluOPRRI.Addi) count (imm12_const_add (ty_bits ty) -64)))) - result)) + (alu_rr_funct12 (AluOPRRI.Clz) r)) +;; Count leading zeros of an i128 value. 
+;; We count both halves separately and conditionally add them if it makes sense. (decl lower_clz_i128 (ValueRegs) ValueRegs) -(rule - (lower_clz_i128 x) - (let - ( ;; count high part. - (high Reg (lower_clz $I64 (value_regs_get x 1))) - ;; coumt low part. - (low_part Reg (lower_clz $I64 (value_regs_get x 0))) - ;;; load constant 64. - (constant_64 Reg (load_u64_constant 64)) - (low Reg (gen_select_reg (IntCC.Equal) constant_64 high low_part (zero_reg))) - ;; add low and high together. - (result Reg (alu_add high low))) - (value_regs result (load_u64_constant 0)))) +(rule (lower_clz_i128 x) + (let ((x_lo Reg (value_regs_get x 0)) + (x_hi Reg (value_regs_get x 1)) + ;; Count both halves + (high Reg (lower_clz $I64 x_hi)) + (low Reg (lower_clz $I64 x_lo)) + ;; Only add the bottom zeros if the top half is zero + (low Reg (gen_select_reg (IntCC.Equal) x_hi (zero_reg) low (zero_reg))) + (result Reg (alu_add high low))) + (zext result $I64 $I128))) + + +(decl lower_cls (Type Reg) Reg) +(rule (lower_cls ty r) + (let ((tmp Reg (ext_int_if_need $true r ty)) + (tmp2 Reg (gen_select_reg (IntCC.SignedLessThan) tmp (zero_reg) (gen_bit_not tmp) tmp)) + (tmp3 Reg (lower_clz ty tmp2))) + (alu_rr_imm12 (AluOPRRI.Addi) tmp3 (imm12_const -1)))) + +;; If the sign bit is set, we count the leading zeros of the inverted value. +;; Otherwise we can just count the leading zeros of the original value. +;; Subtract 1 since the sign bit does not count. 
+(decl lower_cls_i128 (ValueRegs) ValueRegs) +(rule (lower_cls_i128 x) + (let ((low Reg (value_regs_get x 0)) + (high Reg (value_regs_get x 1)) + (low Reg (gen_select_reg (IntCC.SignedLessThan) high (zero_reg) (gen_bit_not low) low)) + (high Reg (gen_select_reg (IntCC.SignedLessThan) high (zero_reg) (gen_bit_not high) high)) + (tmp ValueRegs (lower_clz_i128 (value_regs low high))) + (count Reg (value_regs_get tmp 0)) + (result Reg (alu_rr_imm12 (AluOPRRI.Addi) count (imm12_const -1)))) + (zext result $I64 $I128))) + + +(decl gen_cltz (bool Reg Type) Reg) +(rule (gen_cltz leading rs ty) + (let ((tmp WritableReg (temp_writable_reg $I64)) + (step WritableReg (temp_writable_reg $I64)) + (sum WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.Cltz leading sum step tmp rs ty)))) + sum)) + ;; Extends an integer if it is smaller than 64 bits. (decl ext_int_if_need (bool ValueRegs Type) ValueRegs) @@ -1267,27 +1303,6 @@ (part3 Reg (gen_select_reg (IntCC.Equal) shamt (zero_reg) (zero_reg) part2))) (alu_rrr (AluOPRRR.Or) part1 part3))) -(decl lower_cls (Reg Type) Reg) -(rule - (lower_cls r ty) - (let - ( ;; extract sign bit. 
- (tmp Reg (ext_int_if_need $true r ty)) - ;; - (tmp2 Reg (gen_select_reg (IntCC.SignedLessThan) tmp (zero_reg) (gen_bit_not r) r)) - ;; - (tmp3 Reg (lower_clz ty tmp2))) - (alu_rr_imm12 (AluOPRRI.Addi) tmp3 (imm12_const -1)))) - -(decl gen_cltz (bool Reg Type) Reg) -(rule - (gen_cltz leading rs ty) - (let - ((tmp WritableReg (temp_writable_reg $I64)) - (step WritableReg (temp_writable_reg $I64)) - (sum WritableReg (temp_writable_reg $I64)) - (_ Unit (emit (MInst.Cltz leading sum step tmp rs ty)))) - (writable_reg_to_reg sum))) (decl gen_popcnt (Reg Type) Reg) (rule @@ -1454,24 +1469,6 @@ (gen_select_reg (IntCC.UnsignedGreaterThanOrEqual) shamt_128 const64 high_replacement high)))) -(decl lower_cls_i128 (ValueRegs) ValueRegs) -(rule - (lower_cls_i128 x) - (let - ( ;;; we use clz to implement cls - ;;; if value is negtive we need inverse all bits. - (low Reg - (gen_select_reg (IntCC.SignedLessThan) (value_regs_get x 1) (zero_reg) (gen_bit_not (value_regs_get x 0)) (value_regs_get x 0))) - ;;; - (high Reg - (gen_select_reg (IntCC.SignedLessThan) (value_regs_get x 1) (zero_reg) (gen_bit_not (value_regs_get x 1)) (value_regs_get x 1))) - ;; count leading zeros. 
- (tmp ValueRegs (lower_clz_i128 (value_regs low high))) - (count Reg (value_regs_get tmp 0)) - (result Reg (alu_rr_imm12 (AluOPRRI.Addi) count (imm12_const -1)))) - (value_regs result (load_u64_constant 0)))) - - (decl gen_amode (Reg Offset32 Type) AMode) (extern constructor gen_amode gen_amode) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index ec74f4555a..458b598471 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -327,14 +327,14 @@ ;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type ty (ctz x))) +(rule (lower (has_type (fits_in_64 ty) (ctz x))) (lower_ctz ty x)) (rule 1 (lower (has_type $I128 (ctz x))) (lower_ctz_128 x)) ;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type ty (clz x))) +(rule (lower (has_type (fits_in_64 ty) (clz x))) (lower_clz ty x)) (rule 1 (lower (has_type $I128 (clz x))) @@ -342,7 +342,7 @@ ;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (fits_in_64 ty) (cls x))) - (lower_cls x ty)) + (lower_cls ty x)) (rule 1 (lower (has_type $I128 (cls x))) (lower_cls_i128 x)) diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs index 749e8bb4ee..3de34e4366 100644 --- a/cranelift/codegen/src/isle_prelude.rs +++ b/cranelift/codegen/src/isle_prelude.rs @@ -38,6 +38,11 @@ macro_rules! 
isle_common_prelude_methods { x as u64 } + #[inline] + fn u64_as_i32(&mut self, x: u64) -> i32 { + x as i32 + } + #[inline] fn i64_neg(&mut self, x: i64) -> i64 { x.wrapping_neg() diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 2c19e8a849..012d5e29af 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -102,6 +102,9 @@ (decl u64_as_u32 (u32) u64) (extern extractor u64_as_u32 u64_as_u32) +(decl pure u64_as_i32 (u64) i32) +(extern constructor u64_as_i32 u64_as_i32) + ;;;; Primitive Arithmetic ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl pure u8_and (u8 u8) u8) diff --git a/cranelift/filetests/filetests/isa/riscv64/bitops.clif b/cranelift/filetests/filetests/isa/riscv64/bitops.clif index afe8949480..23aaa6ba51 100644 --- a/cranelift/filetests/filetests/isa/riscv64/bitops.clif +++ b/cranelift/filetests/filetests/isa/riscv64/bitops.clif @@ -385,28 +385,25 @@ block0(v0: i128): ; VCode: ; block0: -; mv t0,a1 -; clz a2,t0##ty=i64 tmp=a3 step=a1 +; clz a2,a1##ty=i64 tmp=a4 step=a3 ; clz a6,a0##ty=i64 tmp=a4 step=a5 -; li t3,64 -; select_reg t0,a6,zero##condition=(t3 eq a2) -; add a0,a2,t0 +; select_reg t3,a6,zero##condition=(a1 eq zero) +; add a0,a2,t3 ; li a1,0 ; ret ; ; Disassembled: ; block0: ; offset 0x0 -; ori t0, a1, 0 ; ori a2, zero, 0 -; addi a1, zero, 0x40 -; addi a3, zero, 1 -; slli a3, a3, 0x3f -; blez a1, 0x1c -; and t5, a3, t0 +; addi a3, zero, 0x40 +; addi a4, zero, 1 +; slli a4, a4, 0x3f +; blez a3, 0x1c +; and t5, a4, a1 ; bne zero, t5, 0x14 ; addi a2, a2, 1 -; addi a1, a1, -1 -; srli a3, a3, 1 +; addi a3, a3, -1 +; srli a4, a4, 1 ; j -0x18 ; ori a6, zero, 0 ; addi a5, zero, 0x40 @@ -419,12 +416,11 @@ block0(v0: i128): ; addi a5, a5, -1 ; srli a4, a4, 1 ; j -0x18 -; addi t3, zero, 0x40 -; beq t3, a2, 0xc -; ori t0, zero, 0 +; beqz a1, 0xc +; ori t3, zero, 0 ; j 8 -; ori t0, a6, 0 -; add a0, a2, t0 +; ori t3, a6, 0 +; add a0, a2, t3 ; mv a1, zero ; ret @@ -438,8 
+434,8 @@ block0(v0: i8): ; block0: ; slli t2,a0,56 ; srai a1,t2,56 -; not a3,a0 -; select_reg a5,a3,a0##condition=(a1 slt zero) +; not a3,a1 +; select_reg a5,a3,a1##condition=(a1 slt zero) ; clz t4,a5##ty=i8 tmp=a7 step=t3 ; addi a0,t4,-1 ; ret @@ -448,9 +444,9 @@ block0(v0: i8): ; block0: ; offset 0x0 ; slli t2, a0, 0x38 ; srai a1, t2, 0x38 -; not a3, a0 +; not a3, a1 ; bltz a1, 0xc -; ori a5, a0, 0 +; ori a5, a1, 0 ; j 8 ; ori a5, a3, 0 ; ori t4, zero, 0 @@ -477,8 +473,8 @@ block0(v0: i16): ; block0: ; slli t2,a0,48 ; srai a1,t2,48 -; not a3,a0 -; select_reg a5,a3,a0##condition=(a1 slt zero) +; not a3,a1 +; select_reg a5,a3,a1##condition=(a1 slt zero) ; clz t4,a5##ty=i16 tmp=a7 step=t3 ; addi a0,t4,-1 ; ret @@ -487,9 +483,9 @@ block0(v0: i16): ; block0: ; offset 0x0 ; slli t2, a0, 0x30 ; srai a1, t2, 0x30 -; not a3, a0 +; not a3, a1 ; bltz a1, 0xc -; ori a5, a0, 0 +; ori a5, a1, 0 ; j 8 ; ori a5, a3, 0 ; ori t4, zero, 0 @@ -515,8 +511,8 @@ block0(v0: i32): ; VCode: ; block0: ; sext.w t2,a0 -; not a1,a0 -; select_reg a3,a1,a0##condition=(t2 slt zero) +; not a1,t2 +; select_reg a3,a1,t2##condition=(t2 slt zero) ; clz a7,a3##ty=i32 tmp=a5 step=a6 ; addi a0,a7,-1 ; ret @@ -524,9 +520,9 @@ block0(v0: i32): ; Disassembled: ; block0: ; offset 0x0 ; sext.w t2, a0 -; not a1, a0 +; not a1, t2 ; bltz t2, 0xc -; ori a3, a0, 0 +; ori a3, t2, 0 ; j 8 ; ori a3, a1, 0 ; ori a7, zero, 0 @@ -592,11 +588,10 @@ block0(v0: i128): ; select_reg a6,a4,a1##condition=(a1 slt zero) ; clz t0,a6##ty=i64 tmp=t3 step=t4 ; clz a1,a2##ty=i64 tmp=t2 step=a0 -; li a3,64 -; select_reg a5,a1,zero##condition=(a3 eq t0) -; add a7,t0,a5 -; li t4,0 -; addi a0,a7,-1 +; select_reg a3,a1,zero##condition=(a6 eq zero) +; add a5,t0,a3 +; li a7,0 +; addi a0,a5,-1 ; li a1,0 ; ret ; @@ -632,14 +627,13 @@ block0(v0: i128): ; addi a0, a0, -1 ; srli t2, t2, 1 ; j -0x18 -; addi a3, zero, 0x40 -; beq a3, t0, 0xc -; ori a5, zero, 0 +; beqz a6, 0xc +; ori a3, zero, 0 ; j 8 -; ori a5, a1, 0 -; add a7, t0, a5 -; mv t4, 
zero -; addi a0, a7, -1 +; ori a3, a1, 0 +; add a5, t0, a3 +; mv a7, zero +; addi a0, a5, -1 ; mv a1, zero ; ret @@ -759,44 +753,42 @@ block0(v0: i128): ; VCode: ; block0: -; mv t0,a0 -; ctz a2,t0##ty=i64 tmp=a0 step=a3 -; ctz a6,a1##ty=i64 tmp=a4 step=a5 -; li t3,64 -; select_reg t0,a6,zero##condition=(t3 eq a2) -; add a0,a2,t0 +; mv t4,a1 +; ctz a2,t4##ty=i64 tmp=a3 step=a1 +; ctz a6,a0##ty=i64 tmp=a4 step=a5 +; select_reg t3,a2,zero##condition=(a0 eq zero) +; add a0,a6,t3 ; li a1,0 ; ret ; ; Disassembled: ; block0: ; offset 0x0 -; ori t0, a0, 0 +; ori t4, a1, 0 ; ori a2, zero, 0 -; addi a3, zero, 0x40 -; addi a0, zero, 1 -; blez a3, 0x1c -; and t5, a0, t0 +; addi a1, zero, 0x40 +; addi a3, zero, 1 +; blez a1, 0x1c +; and t5, a3, t4 ; bne zero, t5, 0x14 ; addi a2, a2, 1 -; addi a3, a3, -1 -; slli a0, a0, 1 +; addi a1, a1, -1 +; slli a3, a3, 1 ; j -0x18 ; ori a6, zero, 0 ; addi a5, zero, 0x40 ; addi a4, zero, 1 ; blez a5, 0x1c -; and t5, a4, a1 +; and t5, a4, a0 ; bne zero, t5, 0x14 ; addi a6, a6, 1 ; addi a5, a5, -1 ; slli a4, a4, 1 ; j -0x18 -; addi t3, zero, 0x40 -; beq t3, a2, 0xc -; ori t0, zero, 0 +; beqz a0, 0xc +; ori t3, zero, 0 ; j 8 -; ori t0, a6, 0 -; add a0, a2, t0 +; ori t3, a2, 0 +; add a0, a6, t3 ; mv a1, zero ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/cls-zbb.clif b/cranelift/filetests/filetests/isa/riscv64/cls-zbb.clif new file mode 100644 index 0000000000..d6683cbe9d --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/cls-zbb.clif @@ -0,0 +1,162 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_zbb + + +function %cls_i8(i8) -> i8 { +block0(v0: i8): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; sext.b t2,a0 +; not a1,t2 +; select_reg a3,a1,t2##condition=(t2 slt zero) +; andi a5,a3,255 +; clz a7,a5 +; addi t4,a7,-56 +; addi a0,t4,-1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x93, 0x13, 0x45, 0x60 +; not a1, t2 +; bltz t2, 0xc +; ori a3, t2, 0 +; j 8 +; ori a3, a1, 0 +; 
andi a5, a3, 0xff +; .byte 0x93, 0x98, 0x07, 0x60 +; addi t4, a7, -0x38 +; addi a0, t4, -1 +; ret + +function %cls_i16(i16) -> i16 { +block0(v0: i16): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; sext.h t2,a0 +; not a1,t2 +; select_reg a3,a1,t2##condition=(t2 slt zero) +; zext.h a5,a3 +; clz a7,a5 +; addi t4,a7,-48 +; addi a0,t4,-1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x93, 0x13, 0x55, 0x60 +; not a1, t2 +; bltz t2, 0xc +; ori a3, t2, 0 +; j 8 +; ori a3, a1, 0 +; .byte 0xbb, 0xc7, 0x06, 0x08 +; .byte 0x93, 0x98, 0x07, 0x60 +; addi t4, a7, -0x30 +; addi a0, t4, -1 +; ret + +function %cls_i32(i32) -> i32 { +block0(v0: i32): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; sext.w t2,a0 +; not a1,t2 +; select_reg a3,a1,t2##condition=(t2 slt zero) +; clzw a5,a3 +; addi a0,a5,-1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; sext.w t2, a0 +; not a1, t2 +; bltz t2, 0xc +; ori a3, t2, 0 +; j 8 +; ori a3, a1, 0 +; .byte 0x9b, 0x97, 0x06, 0x60 +; addi a0, a5, -1 +; ret + +function %cls_i64(i64) -> i64 { +block0(v0: i64): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; not t2,a0 +; select_reg a1,t2,a0##condition=(a0 slt zero) +; clz a3,a1 +; addi a0,a3,-1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; not t2, a0 +; bltz a0, 0xc +; ori a1, a0, 0 +; j 8 +; ori a1, t2, 0 +; .byte 0x93, 0x96, 0x05, 0x60 +; addi a0, a3, -1 +; ret + +function %cls_i128(i128) -> i128 { +block0(v0: i128): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; not a2,a0 +; select_reg a2,a2,a0##condition=(a1 slt zero) +; not a4,a1 +; select_reg a6,a4,a1##condition=(a1 slt zero) +; clz t3,a6 +; clz t0,a2 +; select_reg t2,t0,zero##condition=(a6 eq zero) +; add a1,t3,t2 +; li a3,0 +; addi a0,a1,-1 +; li a1,0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; not a2, a0 +; bltz a1, 8 +; ori a2, a0, 0 +; not a4, a1 +; bltz a1, 0xc +; ori a6, a1, 0 +; j 8 +; ori a6, a4, 0 +; .byte 0x13, 0x1e, 0x08, 0x60 +; .byte 0x93, 0x12, 0x06, 0x60 +; beqz a6, 
0xc +; ori t2, zero, 0 +; j 8 +; ori t2, t0, 0 +; add a1, t3, t2 +; mv a3, zero +; addi a0, a1, -1 +; mv a1, zero +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/clz-zbb.clif b/cranelift/filetests/filetests/isa/riscv64/clz-zbb.clif new file mode 100644 index 0000000000..ff9e83530c --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/clz-zbb.clif @@ -0,0 +1,103 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_zbb + +function %clz_i8(i8) -> i8 { +block0(v0: i8): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; andi t2,a0,255 +; clz a1,t2 +; addi a0,a1,-56 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; andi t2, a0, 0xff +; .byte 0x93, 0x95, 0x03, 0x60 +; addi a0, a1, -0x38 +; ret + +function %clz_i16(i16) -> i16 { +block0(v0: i16): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; zext.h t2,a0 +; clz a1,t2 +; addi a0,a1,-48 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xbb, 0x43, 0x05, 0x08 +; .byte 0x93, 0x95, 0x03, 0x60 +; addi a0, a1, -0x30 +; ret + +function %clz_i32(i32) -> i32 { +block0(v0: i32): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; clzw a0,a0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x1b, 0x15, 0x05, 0x60 +; ret + +function %clz_i64(i64) -> i64 { +block0(v0: i64): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; clz a0,a0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x13, 0x15, 0x05, 0x60 +; ret + +function %clz_i128(i128) -> i128 { +block0(v0: i128): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; clz a2,a1 +; clz a3,a0 +; select_reg a4,a3,zero##condition=(a1 eq zero) +; add a0,a2,a4 +; li a1,0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x13, 0x96, 0x05, 0x60 +; .byte 0x93, 0x16, 0x05, 0x60 +; beqz a1, 0xc +; ori a4, zero, 0 +; j 8 +; ori a4, a3, 0 +; add a0, a2, a4 +; mv a1, zero +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/ctz-zbb-zbs.clif 
b/cranelift/filetests/filetests/isa/riscv64/ctz-zbb-zbs.clif new file mode 100644 index 0000000000..50e3137afa --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/ctz-zbb-zbs.clif @@ -0,0 +1,71 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_zbb has_zbs + +function %ctz_i8(i8) -> i8 { +block0(v0: i8): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; bseti t2,a0,8 +; ctzw a0,t2 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x93, 0x13, 0x85, 0x28 +; .byte 0x1b, 0x95, 0x13, 0x60 +; ret + +function %ctz_i16(i16) -> i16 { +block0(v0: i16): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; bseti t2,a0,16 +; ctzw a0,t2 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x93, 0x13, 0x05, 0x29 +; .byte 0x1b, 0x95, 0x13, 0x60 +; ret + +function %ctz_i32(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ctzw a0,a0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x1b, 0x15, 0x15, 0x60 +; ret + +function %ctz_i64(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ctz a0,a0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x13, 0x15, 0x15, 0x60 +; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/ctz-zbb.clif b/cranelift/filetests/filetests/isa/riscv64/ctz-zbb.clif new file mode 100644 index 0000000000..8ff7579b40 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/ctz-zbb.clif @@ -0,0 +1,102 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_zbb + + +function %ctz_i8(i8) -> i8 { +block0(v0: i8): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ori t2,a0,256 +; ctzw a0,t2 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ori t2, a0, 0x100 +; .byte 0x1b, 0x95, 0x13, 0x60 +; ret + +function %ctz_i16(i16) -> i16 { +block0(v0: i16): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; lui t2,16 +; or a1,a0,t2 +; ctzw a0,a1 +; ret +; +; Disassembled: +; block0: ; 
offset 0x0 +; lui t2, 0x10 +; or a1, a0, t2 +; .byte 0x1b, 0x95, 0x15, 0x60 +; ret + +function %ctz_i32(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ctzw a0,a0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x1b, 0x15, 0x15, 0x60 +; ret + +function %ctz_i64(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ctz a0,a0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x13, 0x15, 0x15, 0x60 +; ret + +function %ctz_i128(i128) -> i128 { +block0(v0: i128): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ctz a1,a1 +; ctz a2,a0 +; select_reg a4,a1,zero##condition=(a0 eq zero) +; add a0,a2,a4 +; li a1,0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0x93, 0x95, 0x15, 0x60 +; .byte 0x13, 0x16, 0x15, 0x60 +; beqz a0, 0xc +; ori a4, zero, 0 +; j 8 +; ori a4, a1, 0 +; add a0, a2, a4 +; mv a1, zero +; ret + diff --git a/cranelift/filetests/filetests/runtests/cls.clif b/cranelift/filetests/filetests/runtests/cls.clif index d87c261939..81e3f54ffd 100644 --- a/cranelift/filetests/filetests/runtests/cls.clif +++ b/cranelift/filetests/filetests/runtests/cls.clif @@ -2,6 +2,7 @@ test interpret test run target aarch64 target riscv64 +target riscv64 has_zbb target s390x ; not implemented on `x86_64` diff --git a/cranelift/filetests/filetests/runtests/clz.clif b/cranelift/filetests/filetests/runtests/clz.clif index 98355af698..22080462bc 100644 --- a/cranelift/filetests/filetests/runtests/clz.clif +++ b/cranelift/filetests/filetests/runtests/clz.clif @@ -5,12 +5,14 @@ target s390x target x86_64 target x86_64 has_lzcnt target riscv64 +target riscv64 has_zbb function %clz_i8(i8) -> i8 { block0(v0: i8): v1 = clz v0 return v1 } +; run: %clz_i8(0) == 8 ; run: %clz_i8(1) == 7 ; run: %clz_i8(0x40) == 1 ; run: %clz_i8(-1) == 0 @@ -20,6 +22,7 @@ block0(v0: i16): v1 = clz v0 return v1 } +; run: %clz_i16(0) == 16 ; run: %clz_i16(1) == 15 ; run: %clz_i16(0x4000) == 1 ; run: 
%clz_i16(-1) == 0 @@ -29,6 +32,7 @@ block0(v0: i32): v1 = clz v0 return v1 } +; run: %clz_i32(0) == 32 ; run: %clz_i32(1) == 31 ; run: %clz_i32(0x40000000) == 1 ; run: %clz_i32(-1) == 0 @@ -38,6 +42,7 @@ block0(v0: i64): v1 = clz v0 return v1 } +; run: %clz_i64(0) == 64 ; run: %clz_i64(1) == 63 ; run: %clz_i64(0x4000000000000000) == 1 ; run: %clz_i64(-1) == 0 diff --git a/cranelift/filetests/filetests/runtests/ctz.clif b/cranelift/filetests/filetests/runtests/ctz.clif index 3051638677..adef585fa2 100644 --- a/cranelift/filetests/filetests/runtests/ctz.clif +++ b/cranelift/filetests/filetests/runtests/ctz.clif @@ -3,14 +3,17 @@ test run target aarch64 target s390x target x86_64 -target riscv64 target x86_64 has_bmi1 +target riscv64 +target riscv64 has_zbb +target riscv64 has_zbb has_zbs function %ctz_i8(i8) -> i8 { block0(v0: i8): v1 = ctz v0 return v1 } +; run: %ctz_i8(0) == 8 ; run: %ctz_i8(1) == 0 ; run: %ctz_i8(0x40) == 6 ; run: %ctz_i8(-1) == 0 @@ -20,6 +23,7 @@ block0(v0: i16): v1 = ctz v0 return v1 } +; run: %ctz_i16(0) == 16 ; run: %ctz_i16(1) == 0 ; run: %ctz_i16(0x4000) == 14 ; run: %ctz_i16(-1) == 0 @@ -29,6 +33,7 @@ block0(v0: i32): v1 = ctz v0 return v1 } +; run: %ctz_i32(0) == 32 ; run: %ctz_i32(1) == 0 ; run: %ctz_i32(0x40000000) == 30 ; run: %ctz_i32(-1) == 0 @@ -38,6 +43,7 @@ block0(v0: i64): v1 = ctz v0 return v1 } +; run: %ctz_i64(0) == 64 ; run: %ctz_i64(1) == 0 ; run: %ctz_i64(0x4000000000000000) == 62 ; run: %ctz_i64(-1) == 0 diff --git a/cranelift/filetests/filetests/runtests/i128-bitops-count.clif b/cranelift/filetests/filetests/runtests/i128-bitops-count.clif index 533fdce315..a57cc43080 100644 --- a/cranelift/filetests/filetests/runtests/i128-bitops-count.clif +++ b/cranelift/filetests/filetests/runtests/i128-bitops-count.clif @@ -4,6 +4,8 @@ target aarch64 target s390x target x86_64 target riscv64 +target riscv64 has_zbb +target riscv64 has_zbb has_zbs function %ctz_i128(i128) -> i128 { block0(v0: i128): diff --git 
a/cranelift/filetests/filetests/runtests/i128-cls.clif b/cranelift/filetests/filetests/runtests/i128-cls.clif index cd9deac102..24fe9a172e 100644 --- a/cranelift/filetests/filetests/runtests/i128-cls.clif +++ b/cranelift/filetests/filetests/runtests/i128-cls.clif @@ -1,6 +1,7 @@ test run target aarch64 -target riscv64 +target riscv64 +target riscv64 has_zbb target s390x function %cls_i128(i128) -> i128 {