From de0e0bea3f70dc003ced9daa8bf7bb95c8bc41a6 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Mon, 6 Feb 2023 13:53:40 -0600
Subject: [PATCH] Legalize `b{and,or,xor}_not` into component instructions (#5709)

* Remove trailing whitespace in `lower.isle` files

* Legalize the `band_not` instruction into simpler form

This commit legalizes the `band_not` instruction into `band`-of-`bnot`, or two instructions. This is intended to assist with egraph-based optimizations, so that the `band_not` instruction doesn't have to be specifically included in other bit-operation patterns. Lowerings of the `band_not` instruction have been moved to a specialization of the `band` instruction.

* Legalize `bor_not` into components

Same as the prior commit, but for the `bor_not` instruction.

* Legalize `bxor_not` into bxor-of-bnot

Same as the prior commits. I think this also ended up fixing a bug in the s390x backend where `bxor_not x y` was actually translated as `bnot (bxor x y)` by accident, judging by the test updates.

* Simplify not-fused operands for riscv64

Looks like some delegated-to rules have special cases for "if this feature is enabled, use the fused instruction", so move the clause testing the feature up to the lowering phase to help trigger other rules if the feature isn't enabled. This should make the riscv64 backend more consistent with how other backends are implemented.

* Remove B{and,or,xor}Not from egraph cost metrics

These should never reach egraphs now that they're legalized away.

* Add an egraph optimization for `x^-1 => ~x`

This adds a simplification to translate xor-against-minus-1 into a `bnot` instruction. This helps trigger various other optimizations in the egraph implementation as well as various backend lowering rules. It is chiefly useful because wasm doesn't have a `bnot` equivalent, so the operation is encoded as `x^-1`.

* Add a wasm test for end-to-end bitwise lowerings

Test end-to-end that various optimizations are applied to input wasm modules.

* Specifically don't self-update rustup on CI

I forget why this was here originally, but it's failing on Windows CI. In general there's no need to update rustup, so leave the installed version as-is.

* Clean up some aarch64 lowering rules

Previously a 32/64 split was necessary due to the `ALUOp` being different, but that's been refactored away now, so there's no longer any need for duplicate rules.

* Narrow an x64 lowering rule

This previously made more sense when it was `band_not` and rarely used; be more specific in this rule's type filter so that it only applies to SIMD types with lanes.

* Simplify xor-against-minus-1 rule

No need for the commutative version since constants are already shuffled to the right for egraphs.

* Optimize band-of-bnot when bnot is on the left

Use some more rules in the egraph algebraic optimizations to canonicalize band/bor/bxor with a `bnot` operand to put that operand on the right. That way the lowerings in the backends only have to list the rule once, with the operand on the right, to optimize both styles of input.
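As a concrete sketch of the effect (in CLIF, mirroring the filetests added below), this canonicalization means both operand orders

    v2 = bnot v0
    v3 = band v2, v1

and

    v2 = bnot v1
    v3 = band v0, v2

reach the same specialized lowering, e.g. a single `bic` instruction on aarch64, or `andn` on riscv64 when the `b` extension is available.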
* Add commutative lowering rules * Update cranelift/codegen/src/isa/x64/lower.isle Co-authored-by: Jamey Sharp --------- Co-authored-by: Jamey Sharp --- .github/actions/install-rust/action.yml | 5 - cranelift/codegen/src/egraph/cost.rs | 3 - cranelift/codegen/src/isa/aarch64/lower.isle | 89 ++++++----- cranelift/codegen/src/isa/riscv64/inst.isle | 30 +--- cranelift/codegen/src/isa/riscv64/lower.isle | 147 ++++++++++-------- cranelift/codegen/src/isa/s390x/lower.isle | 79 +++++----- cranelift/codegen/src/isa/x64/lower.isle | 80 ++++------ cranelift/codegen/src/legalizer/mod.rs | 22 +++ cranelift/codegen/src/opts/algebraic.isle | 5 + .../filetests/filetests/egraph/algebraic.clif | 30 ++++ .../isa/aarch64/bitopts-optimized.clif | 37 +++++ .../isa/riscv64/bitops-optimized.clif | 45 ++++++ .../filetests/isa/riscv64/bitops.clif | 42 ++--- .../filetests/isa/s390x/bitops-optimized.clif | 66 ++++++++ .../filetests/isa/s390x/bitwise.clif | 40 ++--- .../filetests/isa/x64/band_not_bmi1.clif | 17 ++ .../filetests/filetests/wasm/i32-not-x64.wat | 46 ++++++ 17 files changed, 506 insertions(+), 277 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/aarch64/bitopts-optimized.clif create mode 100644 cranelift/filetests/filetests/isa/riscv64/bitops-optimized.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/bitops-optimized.clif create mode 100644 cranelift/filetests/filetests/wasm/i32-not-x64.wat diff --git a/.github/actions/install-rust/action.yml b/.github/actions/install-rust/action.yml index e63fcc4eb2..d8016f78e9 100644 --- a/.github/actions/install-rust/action.yml +++ b/.github/actions/install-rust/action.yml @@ -18,11 +18,6 @@ runs: - name: Install Rust shell: bash run: | - - if [[ "${{ runner.os }}" = "Windows" ]]; then - rustup self update - fi - rustup set profile minimal rustup update "${{ inputs.toolchain }}" --no-self-update rustup default "${{ inputs.toolchain }}" diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs index fd9a22ee23..9cfb0894ca 100644 --- a/cranelift/codegen/src/egraph/cost.rs +++ b/cranelift/codegen/src/egraph/cost.rs @@ -85,11 +85,8 @@ pub(crate) fn pure_op_cost(op: Opcode) -> Cost { Opcode::Iadd | Opcode::Isub | Opcode::Band - | Opcode::BandNot | Opcode::Bor - | Opcode::BorNot | Opcode::Bxor - | Opcode::BxorNot | Opcode::Bnot | Opcode::Ishl | Opcode::Ushr diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 1238f463aa..eccda08b60 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -580,7 +580,7 @@ (sub ty (zero_reg) x)) ;; `i128` -(rule 2 (lower (has_type $I128 (ineg x))) +(rule 2 (lower (has_type $I128 (ineg x))) (sub_i128 (value_regs_zero) x)) ;; vectors. @@ -1054,75 +1054,74 @@ ;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (fits_in_32 ty) (band x y))) +(rule -1 (lower (has_type (fits_in_64 ty) (band x y))) (alu_rs_imm_logic_commutative (ALUOp.And) ty x y)) -(rule (lower (has_type $I64 (band x y))) - (alu_rs_imm_logic_commutative (ALUOp.And) $I64 x y)) - (rule (lower (has_type $I128 (band x y))) (i128_alu_bitop (ALUOp.And) $I64 x y)) (rule -2 (lower (has_type (ty_vec128 ty) (band x y))) (and_vec x y (vector_size ty))) +;; Specialized lowerings for `(band x (bnot y))` which is additionally produced +;; by Cranelift's `band_not` instruction that is legalized into the simpler +;; forms early on. 
+ +(rule 1 (lower (has_type (fits_in_64 ty) (band x (bnot y)))) + (alu_rs_imm_logic (ALUOp.AndNot) ty x y)) +(rule 2 (lower (has_type (fits_in_64 ty) (band (bnot y) x))) + (alu_rs_imm_logic (ALUOp.AndNot) ty x y)) + +(rule 3 (lower (has_type $I128 (band x (bnot y)))) (i128_alu_bitop (ALUOp.AndNot) $I64 x y)) +(rule 4 (lower (has_type $I128 (band (bnot y) x))) (i128_alu_bitop (ALUOp.AndNot) $I64 x y)) + +(rule 5 (lower (has_type (ty_vec128 ty) (band x (bnot y)))) + (bic_vec x y (vector_size ty))) +(rule 6 (lower (has_type (ty_vec128 ty) (band (bnot y) x))) + (bic_vec x y (vector_size ty))) + ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (fits_in_32 ty) (bor x y))) +(rule -1 (lower (has_type (fits_in_64 ty) (bor x y))) (alu_rs_imm_logic_commutative (ALUOp.Orr) ty x y)) -(rule (lower (has_type $I64 (bor x y))) - (alu_rs_imm_logic_commutative (ALUOp.Orr) $I64 x y)) - (rule (lower (has_type $I128 (bor x y))) (i128_alu_bitop (ALUOp.Orr) $I64 x y)) (rule -2 (lower (has_type (ty_vec128 ty) (bor x y))) (orr_vec x y (vector_size ty))) +;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced +;; by Cranelift's `bor_not` instruction that is legalized into the simpler +;; forms early on. + +(rule 1 (lower (has_type (fits_in_64 ty) (bor x (bnot y)))) + (alu_rs_imm_logic (ALUOp.OrrNot) ty x y)) +(rule 2 (lower (has_type (fits_in_64 ty) (bor (bnot y) x))) + (alu_rs_imm_logic (ALUOp.OrrNot) ty x y)) + +(rule 3 (lower (has_type $I128 (bor x (bnot y)))) (i128_alu_bitop (ALUOp.OrrNot) $I64 x y)) +(rule 4 (lower (has_type $I128 (bor (bnot y) x))) (i128_alu_bitop (ALUOp.OrrNot) $I64 x y)) + ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (fits_in_32 ty) (bxor x y))) +(rule -1 (lower (has_type (fits_in_64 ty) (bxor x y))) (alu_rs_imm_logic_commutative (ALUOp.Eor) ty x y)) -(rule (lower (has_type $I64 (bxor x y))) - (alu_rs_imm_logic_commutative (ALUOp.Eor) $I64 x y)) - (rule (lower (has_type $I128 (bxor x y))) (i128_alu_bitop (ALUOp.Eor) $I64 x y)) (rule -2 (lower (has_type (ty_vec128 ty) (bxor x y))) (eor_vec x y (vector_size ty))) -;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Specialized lowerings for `(bxor x (bnot y))` which is additionally produced +;; by Cranelift's `bxor_not` instruction that is legalized into the simpler +;; forms early on. 
-(rule -1 (lower (has_type (fits_in_32 ty) (band_not x y))) - (alu_rs_imm_logic (ALUOp.AndNot) ty x y)) +(rule 1 (lower (has_type (fits_in_64 ty) (bxor x (bnot y)))) + (alu_rs_imm_logic (ALUOp.EorNot) ty x y)) +(rule 2 (lower (has_type (fits_in_64 ty) (bxor (bnot y) x))) + (alu_rs_imm_logic (ALUOp.EorNot) ty x y)) -(rule (lower (has_type $I64 (band_not x y))) - (alu_rs_imm_logic (ALUOp.AndNot) $I64 x y)) - -(rule (lower (has_type $I128 (band_not x y))) (i128_alu_bitop (ALUOp.AndNot) $I64 x y)) - -(rule -2 (lower (has_type (ty_vec128 ty) (band_not x y))) - (bic_vec x y (vector_size ty))) - -;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule -1 (lower (has_type (fits_in_32 ty) (bor_not x y))) - (alu_rs_imm_logic (ALUOp.OrrNot) ty x y)) - -(rule (lower (has_type $I64 (bor_not x y))) - (alu_rs_imm_logic (ALUOp.OrrNot) $I64 x y)) - -(rule (lower (has_type $I128 (bor_not x y))) (i128_alu_bitop (ALUOp.OrrNot) $I64 x y)) - -;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(rule -1 (lower (has_type (fits_in_32 ty) (bxor_not x y))) - (alu_rs_imm_logic (ALUOp.EorNot) $I32 x y)) - -(rule (lower (has_type $I64 (bxor_not x y))) - (alu_rs_imm_logic (ALUOp.EorNot) $I64 x y)) - -(rule (lower (has_type $I128 (bxor_not x y))) (i128_alu_bitop (ALUOp.EorNot) $I64 x y)) +(rule 3 (lower (has_type $I128 (bxor x (bnot y)))) (i128_alu_bitop (ALUOp.EorNot) $I64 x y)) +(rule 4 (lower (has_type $I128 (bxor (bnot y) x))) (i128_alu_bitop (ALUOp.EorNot) $I64 x y)) ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2407,7 +2406,7 @@ ;; sign extended. We then check if the output sign bit has flipped. (rule 0 (lower (has_type (fits_in_16 ty) (iadd_cout a b))) (let ((extend ExtendOp (lower_extend_op ty $true)) - + ;; Instead of emitting two `sxt{b,h}` we do one as an instruction and ;; the other as an extend operation in the `add` instruction. 
;; @@ -2417,7 +2416,7 @@ ;; cset out_carry, ne (a_sext Reg (put_in_reg_sext32 a)) (out Reg (add_extend_op ty a_sext b extend)) - (out_carry Reg (with_flags_reg + (out_carry Reg (with_flags_reg (cmp_extend (OperandSize.Size32) out out extend) (cset (Cond.Ne))))) (output_pair diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle index 9a7e5faf9e..ef6f8dd379 100644 --- a/cranelift/codegen/src/isa/riscv64/inst.isle +++ b/cranelift/codegen/src/isa/riscv64/inst.isle @@ -1950,32 +1950,14 @@ ;;; (decl gen_andn (Reg Reg) Reg) -(rule 1 - (gen_andn rs1 rs2) - (if-let $true (has_b)) +(rule 1 (gen_andn rs1 rs2) (alu_rrr (AluOPRRR.Andn) rs1 rs2)) -(rule - (gen_andn rs1 rs2) - (if-let $false (has_b)) - (let - ((tmp Reg (gen_bit_not rs2))) - (alu_and rs1 tmp))) - ;;; (decl gen_orn (Reg Reg) Reg) -(rule 1 - (gen_orn rs1 rs2 ) - (if-let $true (has_b)) +(rule 1 (gen_orn rs1 rs2) (alu_rrr (AluOPRRR.Orn) rs1 rs2)) -(rule - (gen_orn rs1 rs2) - (if-let $false (has_b)) - (let - ((tmp Reg (gen_bit_not rs2))) - (alu_rrr (AluOPRRR.Or) rs1 tmp))) - (decl gen_rev8 (Reg) Reg) (rule 1 (gen_rev8 rs) @@ -2014,14 +1996,6 @@ (_ Unit (emit (MInst.Brev8 rs ty step tmp tmp2 rd)))) (writable_reg_to_reg rd))) -;;; x ^ ~y -(decl gen_xor_not (Reg Reg) Reg) -(rule - (gen_xor_not x y) - (let - ((tmp Reg (gen_bit_not y))) - (alu_rrr (AluOPRRR.Xor) x tmp))) - ;; Negates x ;; Equivalent to 0 - x (decl neg (Type ValueRegs) ValueRegs) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 4d169abf5c..fdaa7102c4 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -40,14 +40,14 @@ (rule 2 (lower (has_type (fits_in_64 ty) (iadd (imm12_from_value x) y))) (alu_rr_imm12 (select_addi ty) y x)) -(rule +(rule (lower (has_type $I128 (iadd x y))) (let ( ;; low part. (low Reg (alu_add (value_regs_get x 0) (value_regs_get y 0))) ;; compute carry. (carry Reg (alu_rrr (AluOPRRR.SltU) low (value_regs_get y 0))) - ;; + ;; (high_tmp Reg (alu_add (value_regs_get x 1) (value_regs_get y 1))) ;; add carry. (high Reg (alu_add high_tmp carry))) @@ -158,19 +158,19 @@ (alu_rrr (AluOPRRR.Remuw) (ext_int_if_need $false x ty) y2))) (rule -1 (lower (has_type (fits_in_16 ty) (srem x y))) - (let + (let ((y2 Reg (ext_int_if_need $true y ty)) (_ InstOutput (gen_div_by_zero y2))) (alu_rrr (AluOPRRR.Remw) (ext_int_if_need $true x ty) y2))) (rule (lower (has_type $I32 (srem x y))) - (let + (let ((y2 Reg (ext_int_if_need $true y $I32)) (_ InstOutput (gen_div_by_zero y2))) (alu_rrr (AluOPRRR.Remw) x y2))) (rule (lower (has_type $I32 (urem x y))) - (let + (let ((y2 Reg (ext_int_if_need $false y $I32)) (_ InstOutput (gen_div_by_zero y2))) (alu_rrr (AluOPRRR.Remuw) x y2))) @@ -204,6 +204,29 @@ (rule (lower (has_type $F64 (band x y))) (lower_float_binary (AluOPRRR.And) x y $F64)) +;; Specialized lowerings for `(band x (bnot y))` which is additionally produced +;; by Cranelift's `band_not` instruction that is legalized into the simpler +;; forms early on. 
+ +(rule 3 (lower (has_type (fits_in_64 ty) (band x (bnot y)))) + (if-let $true (has_b)) + (gen_andn x y)) +(rule 4 (lower (has_type (fits_in_64 ty) (band (bnot y) x))) + (if-let $true (has_b)) + (gen_andn x y)) +(rule 5 (lower (has_type $I128 (band x (bnot y)))) + (if-let $true (has_b)) + (let + ((low Reg (gen_andn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (gen_andn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) +(rule 6 (lower (has_type $I128 (band (bnot y) x))) + (if-let $true (has_b)) + (let + ((low Reg (gen_andn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (gen_andn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) + ;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type (fits_in_64 ty) (bor x y))) @@ -222,6 +245,30 @@ (rule (lower (has_type $F64 (bor x y))) (lower_float_binary (AluOPRRR.Or) x y $F64)) +;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced +;; by Cranelift's `bor_not` instruction that is legalized into the simpler +;; forms early on. + +(rule 3 (lower (has_type (fits_in_64 ty) (bor x (bnot y)))) + (if-let $true (has_b)) + (gen_orn x y)) +(rule 4 (lower (has_type (fits_in_64 ty) (bor (bnot y) x))) + (if-let $true (has_b)) + (gen_orn x y)) + +(rule 5 (lower (has_type $I128 (bor x (bnot y)))) + (if-let $true (has_b)) + (let + ((low Reg (gen_orn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (gen_orn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) +(rule 6 (lower (has_type $I128 (bor (bnot y) x))) + (if-let $true (has_b)) + (let + ((low Reg (gen_orn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (gen_orn (value_regs_get x 1) (value_regs_get y 1)))) + (value_regs low high))) + ;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 (lower (has_type (fits_in_64 ty) (bxor x y))) @@ -289,15 +336,6 @@ (lower_extend x $true (ty_bits in) (ty_bits out))) -;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type (fits_in_64 ty) (band_not x y))) - (gen_andn x y)) -(rule 1 (lower (has_type $I128 (band_not x y))) - (let - ((low Reg (gen_andn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (gen_andn (value_regs_get x 1) (value_regs_get y 1)))) - (value_regs low high))) - ;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (fits_in_64 ty) (popcnt x))) (lower_popcnt x ty)) @@ -397,29 +435,6 @@ (lower_i128_rotr x y)) -;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; notice x y order!!! 
-(rule (lower (has_type (fits_in_64 ty) (bxor_not x y))) - (gen_xor_not x y)) -(rule 1 (lower (has_type $I128 (bxor_not x y))) - (let - ((low Reg (gen_xor_not (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (gen_xor_not (value_regs_get x 1) (value_regs_get y 1)))) - (value_regs low high) - ) -) - -;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule (lower (has_type (fits_in_64 ty) (bor_not x y))) - (gen_orn x y)) - -(rule 1 (lower (has_type $I128 (bor_not x y))) - (let - ((low Reg (gen_orn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (gen_orn (value_regs_get x 1) (value_regs_get y 1)))) - (value_regs low high))) - - ;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type (fits_in_64 ty) (cls x))) (lower_cls x ty)) @@ -428,12 +443,12 @@ ;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule +(rule (lower (has_type ty (fabs x))) (gen_fabs x ty)) ;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule +(rule (lower (has_type ty (fneg x))) (fpu_rrr (f_copy_neg_sign_op ty) ty x x)) @@ -458,35 +473,35 @@ ;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule -1 - ;; - (lower + ;; + (lower (has_type (valid_atomic_transaction ty) (atomic_rmw flags op addr x))) (gen_atomic (get_atomic_rmw_op ty op) addr x (atomic_amo))) ;;; for I8 and I16 (rule 1 - (lower + (lower (has_type (valid_atomic_transaction (fits_in_16 ty)) (atomic_rmw flags op addr x))) (gen_atomic_rmw_loop op ty addr x)) ;;;special for I8 and I16 max min etc. ;;;because I need uextend or sextend the value. (rule 2 - (lower + (lower (has_type (valid_atomic_transaction (fits_in_16 ty)) (atomic_rmw flags (is_atomic_rmw_max_etc op $true) addr x))) (gen_atomic_rmw_loop op ty addr (ext_int_if_need $true x ty))) (rule 2 ;; - (lower + (lower (has_type (valid_atomic_transaction (fits_in_16 ty)) (atomic_rmw flags (is_atomic_rmw_max_etc op $false) addr x))) ;; (gen_atomic_rmw_loop op ty addr (ext_int_if_need $false x ty))) ;;;;; Rules for `AtomicRmwOp.Sub` (rule - (lower + (lower (has_type (valid_atomic_transaction ty) (atomic_rmw flags (AtomicRmwOp.Sub) addr x))) (let ((tmp WritableReg (temp_writable_reg ty)) @@ -504,7 +519,7 @@ ;;;;; Rules for `AtomicRmwOp.Nand` (rule - (lower + (lower (has_type (valid_atomic_transaction ty) (atomic_rmw flags (AtomicRmwOp.Nand) addr x))) (gen_atomic_rmw_loop (AtomicRmwOp.Nand) ty addr x)) @@ -512,13 +527,13 @@ (extern extractor is_atomic_rmw_max_etc is_atomic_rmw_max_etc) ;;;;; Rules for `atomic load`;;;;;;;;;;;;;;;;; -(rule +(rule (lower (has_type (valid_atomic_transaction ty) (atomic_load flags p))) (gen_atomic_load p ty)) ;;;;; Rules for `atomic store`;;;;;;;;;;;;;;;;; -(rule +(rule (lower (atomic_store flags src @ (value_type (valid_atomic_transaction ty)) p)) (gen_atomic_store p ty src)) @@ -562,37 +577,37 @@ ;;;;; Rules for `for float arithmatic` -(rule +(rule (lower (has_type ty (fadd x y))) (fpu_rrr (f_arithmatic_op ty (Opcode.Fadd)) ty x y)) -(rule +(rule (lower (has_type ty (fsub x y))) (fpu_rrr (f_arithmatic_op ty (Opcode.Fsub)) ty x y)) -(rule +(rule (lower (has_type ty (fmul x y))) (fpu_rrr (f_arithmatic_op ty (Opcode.Fmul)) ty x y)) -(rule +(rule (lower (has_type ty (fdiv x y))) (fpu_rrr (f_arithmatic_op ty (Opcode.Fdiv)) ty x y)) -(rule +(rule (lower (has_type ty (fmin x y))) (gen_float_select (FloatSelectOP.Min) x y ty)) -(rule +(rule (lower (has_type ty (fmin_pseudo x y))) (gen_float_select_pseudo 
(FloatSelectOP.Min) x y ty)) -(rule +(rule (lower (has_type ty (fmax x y))) (gen_float_select (FloatSelectOP.Max) x y ty)) -(rule +(rule (lower (has_type ty (fmax_pseudo x y))) (gen_float_select_pseudo (FloatSelectOP.Max) x y ty)) ;;;;; Rules for `stack_addr`;;;;;;;;; -(rule +(rule (lower (stack_addr ss offset)) (gen_stack_addr ss offset)) @@ -624,7 +639,7 @@ (gen_bitselect ty c x y)) ;;;;; Rules for `isplit`;;;;;;;;; -(rule +(rule (lower (isplit x)) (let ((t1 Reg (gen_move2 (value_regs_get x 0) $I64 $I64)) @@ -632,7 +647,7 @@ (output_pair t1 t2))) ;;;;; Rules for `iconcat`;;;;;;;;; -(rule +(rule (lower (has_type $I128 (iconcat x y))) (let ((t1 Reg (gen_move2 x $I64 $I64)) @@ -716,16 +731,16 @@ (gen_load_128 p offset flags)) ;;;;; Rules for `istore8`;;;;;;;;; -(rule +(rule (lower (istore8 flags x p offset)) (gen_store p offset (StoreOP.Sb) flags x)) ;;;;; Rules for `istore16`;;;;;;;;; -(rule +(rule (lower (istore16 flags x p offset)) (gen_store p offset (StoreOP.Sh) flags x)) ;;;;; Rules for `istore32`;;;;;;;;; -(rule +(rule (lower (istore32 flags x p offset)) (gen_store p offset (StoreOP.Sw) flags x)) @@ -803,22 +818,22 @@ (gen_moves v in_ty out)) ;;;;; Rules for `ceil`;;;;;;;;; -(rule +(rule (lower (has_type ty (ceil x))) (gen_float_round (FloatRoundOP.Ceil) x ty) ) ;;;;; Rules for `floor`;;;;;;;;; -(rule +(rule (lower (has_type ty (floor x))) (gen_float_round (FloatRoundOP.Floor) x ty)) ;;;;; Rules for `trunc`;;;;;;;;; -(rule +(rule (lower (has_type ty (trunc x))) (gen_float_round (FloatRoundOP.Trunc) x ty)) ;;;;; Rules for `nearest`;;;;;;;;; -(rule +(rule (lower (has_type ty (nearest x))) (gen_float_round (FloatRoundOP.Nearest) x ty)) diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 9bbbc53f6c..1eed87da07 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -983,6 +983,22 @@ (rule 0 (lower (has_type (vr128_ty ty) (band x y))) (vec_and ty x y)) +;; Specialized lowerings for `(band x (bnot y))` which is additionally produced +;; by Cranelift's `band_not` instruction that is legalized into the simpler +;; forms early on. + +;; z15 version using a single instruction. +(rule 7 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (band x (bnot y)))) + (and_not_reg ty x y)) +(rule 8 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (band (bnot y) x))) + (and_not_reg ty x y)) + +;; And-not two vector registers. +(rule 9 (lower (has_type (vr128_ty ty) (band x (bnot y)))) + (vec_and_not ty x y)) +(rule 10 (lower (has_type (vr128_ty ty) (band (bnot y) x))) + (vec_and_not ty x y)) + ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Or two registers. @@ -1009,6 +1025,22 @@ (rule 0 (lower (has_type (vr128_ty ty) (bor x y))) (vec_or ty x y)) +;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced +;; by Cranelift's `bor_not` instruction that is legalized into the simpler +;; forms early on. + +;; z15 version using a single instruction. +(rule 7 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bor x (bnot y)))) + (or_not_reg ty x y)) +(rule 8 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bor (bnot y) x))) + (or_not_reg ty x y)) + +;; Or-not two vector registers. 
+(rule 9 (lower (has_type (vr128_ty ty) (bor x (bnot y)))) + (vec_or_not ty x y)) +(rule 10 (lower (has_type (vr128_ty ty) (bor (bnot y) x))) + (vec_or_not ty x y)) + ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1032,49 +1064,20 @@ (rule 0 (lower (has_type (vr128_ty ty) (bxor x y))) (vec_xor ty x y)) - -;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Specialized lowerings for `(bxor x (bnot y))` which is additionally produced +;; by Cranelift's `bxor_not` instruction that is legalized into the simpler +;; forms early on. ;; z15 version using a single instruction. -(rule 2 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (band_not x y))) - (and_not_reg ty x y)) - -;; z14 version using XOR with -1. -(rule 1 (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (band_not x y))) - (and_reg ty x (not_reg ty y))) - -;; And-not two vector registers. -(rule (lower (has_type (vr128_ty ty) (band_not x y))) - (vec_and_not ty x y)) - - -;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; z15 version using a single instruction. -(rule 2 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bor_not x y))) - (or_not_reg ty x y)) - -;; z14 version using XOR with -1. -(rule 1 (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bor_not x y))) - (or_reg ty x (not_reg ty y))) - -;; Or-not two vector registers. -(rule (lower (has_type (vr128_ty ty) (bor_not x y))) - (vec_or_not ty x y)) - - -;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; z15 version using a single instruction. -(rule 2 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bxor_not x y))) +(rule 5 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bxor x (bnot y)))) + (not_xor_reg ty x y)) +(rule 6 (lower (has_type (and (mie2_enabled) (fits_in_64 ty)) (bxor (bnot y) x))) (not_xor_reg ty x y)) -;; z14 version using XOR with -1. -(rule 1 (lower (has_type (and (mie2_disabled) (fits_in_64 ty)) (bxor_not x y))) - (not_reg ty (xor_reg ty x y))) - ;; Xor-not two vector registers. -(rule (lower (has_type (vr128_ty ty) (bxor_not x y))) +(rule 7 (lower (has_type (vr128_ty ty) (bxor x (bnot y)))) + (vec_not_xor ty x y)) +(rule 8 (lower (has_type (vr128_ty ty) (bxor (bnot y) x))) (vec_not_xor ty x y)) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 0f01e9cc53..9adc4e2fdb 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -269,6 +269,36 @@ (value_gprs (x64_and $I64 x_lo y_lo) (x64_and $I64 x_hi y_hi)))) +;; Specialized lowerings for `(band x (bnot y))` which is additionally produced +;; by Cranelift's `band_not` instruction that is legalized into the simpler +;; forms early on. 
+
+(decl sse_and_not (Type Xmm XmmMem) Xmm)
+(rule (sse_and_not $F32X4 x y) (x64_andnps x y))
+(rule (sse_and_not $F64X2 x y) (x64_andnpd x y))
+(rule -1 (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y))
+
+;; Note the flipping of operands below as we're matching
+;;
+;;   (band x (bnot y))
+;;
+;; while x86 does
+;;
+;;   pandn(x, y) = and(not(x), y)
+(rule 8 (lower (has_type ty @ (multi_lane _bits _lane) (band x (bnot y))))
+      (sse_and_not ty y x))
+(rule 9 (lower (has_type ty @ (multi_lane _bits _lane) (band (bnot y) x)))
+      (sse_and_not ty y x))
+
+(rule 10 (lower (has_type ty @ (use_bmi1 $true) (band x (bnot y))))
+      (if (ty_int_ref_scalar_64 ty))
+      ;; the first argument is the one that gets inverted with andn
+      (x64_andn ty y x))
+(rule 11 (lower (has_type ty @ (use_bmi1 $true) (band (bnot y) x)))
+      (if (ty_int_ref_scalar_64 ty))
+      (x64_andn ty y x))
+
+
 ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `{i,b}64` and smaller.
@@ -1085,52 +1115,6 @@
           (OperandSize.Size32))))
       (x64_pmuludq x2 y2)))
 
-;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(decl sse_and_not (Type Xmm XmmMem) Xmm)
-(rule (sse_and_not $F32X4 x y) (x64_andnps x y))
-(rule (sse_and_not $F64X2 x y) (x64_andnpd x y))
-(rule -1 (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y))
-
-;; Note the flipping of operands below. CLIF specifies
-;;
-;;   band_not(x, y) = and(x, not(y))
-;;
-;; while x86 does
-;;
-;;   pandn(x, y) = and(not(x), y)
-(rule 0 (lower (has_type ty (band_not x y)))
-      (sse_and_not ty y x))
-
-
-(rule 1 (lower (has_type ty @ (use_bmi1 $false) (band_not x y)))
-      (if (ty_int_ref_scalar_64 ty))
-      (x64_and ty
-               x
-               (x64_not ty y)))
-
-(rule 1 (lower (has_type ty @ (use_bmi1 $true) (band_not x y)))
-      (if (ty_int_ref_scalar_64 ty))
-      ;; the first argument is the one that gets inverted with andn
-      (x64_andn ty y x))
-
-
-;;;; Rules for `bxor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule 0 (lower (has_type ty (bxor_not x y)))
-      (if (ty_int_ref_scalar_64 ty))
-      (x64_xor ty
-               x
-               (x64_not ty y)))
-
-;;;; Rules for `bor_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(rule 0 (lower (has_type ty (bor_not x y)))
-      (if (ty_int_ref_scalar_64 ty))
-      (x64_or ty
-              x
-              (x64_not ty y)))
-
 ;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8X16 (iabs x)))
@@ -1167,7 +1151,7 @@
        ;; it below, since we need to pass it into the cmove
        ;; before we pass the cmove to with_flags_reg.
        (neg_result Gpr (produces_flags_get_reg neg))
-       ;; When the neg instruction sets the sign flag, 
+       ;; When the neg instruction sets the sign flag,
        ;; takes the original (non-negative) value.
(cmove ConsumesFlags (cmove ty (CC.S) src neg_result))) (with_flags_reg (produces_flags_ignore neg) cmove))) @@ -3586,7 +3570,7 @@ (u8_from_uimm8 lane)))) (x64_pshufd val lane (OperandSize.Size32))) -;; This is the only remaining case for F64X2 +;; This is the only remaining case for F64X2 (rule 1 (lower (has_type $F64 (extractlane val @ (value_type (ty_vec128 ty)) (u8_from_uimm8 1)))) ;; 0xee == 0b11_10_11_10 diff --git a/cranelift/codegen/src/legalizer/mod.rs b/cranelift/codegen/src/legalizer/mod.rs index af836a0622..6fa43e0552 100644 --- a/cranelift/codegen/src/legalizer/mod.rs +++ b/cranelift/codegen/src/legalizer/mod.rs @@ -224,6 +224,28 @@ pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa: pos.func.dfg.replace(inst).icmp(cond, arg, imm); } + // Legalize the fused bitwise-plus-not instructions into simpler + // instructions to assist with optimizations. Lowering will + // pattern match this sequence regardless when architectures + // support the instruction natively. + InstructionData::Binary { opcode, args } => { + match opcode { + ir::Opcode::BandNot => { + let neg = pos.ins().bnot(args[1]); + pos.func.dfg.replace(inst).band(args[0], neg); + } + ir::Opcode::BorNot => { + let neg = pos.ins().bnot(args[1]); + pos.func.dfg.replace(inst).bor(args[0], neg); + } + ir::Opcode::BxorNot => { + let neg = pos.ins().bnot(args[1]); + pos.func.dfg.replace(inst).bxor(args[0], neg); + } + _ => prev_pos = pos.position(), + }; + } + _ => { prev_pos = pos.position(); continue; diff --git a/cranelift/codegen/src/opts/algebraic.isle b/cranelift/codegen/src/opts/algebraic.isle index d889fd8faa..888dd51bbc 100644 --- a/cranelift/codegen/src/opts/algebraic.isle +++ b/cranelift/codegen/src/opts/algebraic.isle @@ -281,3 +281,8 @@ (rule (simplify (icmp (ty_int ty) (IntCC.SignedLessThanOrEqual) x x)) (iconst ty (imm64 1))) + +;; (x ^ -1) can be replaced with the `bnot` instruction +(rule (simplify (bxor ty x (iconst ty k))) + (if-let -1 (i64_sextend_imm64 ty k)) + (bnot ty x)) diff --git a/cranelift/filetests/filetests/egraph/algebraic.clif b/cranelift/filetests/filetests/egraph/algebraic.clif index ded7e52b0a..1962eb508f 100644 --- a/cranelift/filetests/filetests/egraph/algebraic.clif +++ b/cranelift/filetests/filetests/egraph/algebraic.clif @@ -221,3 +221,33 @@ block0(v1: i8): ; check: v3 = iconst.i8 0 ; check: return v3 + +function %bnot1(i8) -> i8 { +block0(v1: i8): + v2 = iconst.i8 -1 + v3 = bxor v1, v2 + return v3 +} + +; check: v4 = bnot v1 +; check: return v4 + +function %bnot2(i64) -> i64 { +block0(v1: i64): + v2 = iconst.i64 -1 + v3 = bxor v1, v2 + return v3 +} + +; check: v4 = bnot v1 +; check: return v4 + +function %bnot3(i64) -> i64 { +block0(v1: i64): + v2 = iconst.i64 -1 + v3 = bxor v2, v1 + return v3 +} + +; check: v5 = bnot v1 +; check: return v5 diff --git a/cranelift/filetests/filetests/isa/aarch64/bitopts-optimized.clif b/cranelift/filetests/filetests/isa/aarch64/bitopts-optimized.clif new file mode 100644 index 0000000000..c46a729001 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/bitopts-optimized.clif @@ -0,0 +1,37 @@ +test compile precise-output +set unwind_info=false +set opt_level=speed +target aarch64 + +function %band_not_i32_reversed(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = band v2, v1 + return v3 +} + +; block0: +; bic w0, w1, w0 +; ret + +function %bor_not_i32_reversed(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = bor v2, v1 + return v3 +} + +; block0: +; orn w0, w1, w0 +; ret + +function 
%bxor_not_i32_reversed(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = bxor v2, v1 + return v3 +} + +; block0: +; eon w0, w1, w0 +; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/bitops-optimized.clif b/cranelift/filetests/filetests/isa/riscv64/bitops-optimized.clif new file mode 100644 index 0000000000..1f249ff3ed --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/bitops-optimized.clif @@ -0,0 +1,45 @@ +test compile precise-output +set opt_level=speed +target riscv64 has_b + +function %band_not_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = band_not.i32 v0, v1 + return v2 +} + +; block0: +; andn a0,a0,a1 +; ret + +function %band_not_i32_reversed(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = band v2, v1 + return v3 +} + +; block0: +; andn a0,a1,a0 +; ret + +function %bor_not_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bor_not.i32 v0, v1 + return v2 +} + +; block0: +; orn a0,a0,a1 +; ret + +function %bor_not_i32_reversed(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = bor v2, v1 + return v3 +} + +; block0: +; orn a0,a1,a0 +; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/bitops.clif b/cranelift/filetests/filetests/isa/riscv64/bitops.clif index 48697c3a0b..2857ad8a12 100644 --- a/cranelift/filetests/filetests/isa/riscv64/bitops.clif +++ b/cranelift/filetests/filetests/isa/riscv64/bitops.clif @@ -631,9 +631,9 @@ block0(v0: i128, v1: i128): } ; block0: -; not a2,a2 -; and a0,a0,a2 +; not a4,a2 ; not a6,a3 +; and a0,a0,a4 ; and a1,a1,a6 ; ret @@ -645,9 +645,9 @@ block0(v0: i64): } ; block0: -; li t2,4 -; not a1,t2 -; and a0,a0,a1 +; li a1,4 +; not a2,a1 +; and a0,a0,a2 ; ret function %band_not_i64_constant_shift(i64, i64) -> i64 { @@ -660,8 +660,8 @@ block0(v0: i64, v1: i64): ; block0: ; slli a2,a1,4 -; not a1,a2 -; and a0,a0,a1 +; not a2,a2 +; and a0,a0,a2 ; ret function %bor_not_i32(i32, i32) -> i32 { @@ -693,9 +693,9 @@ block0(v0: i128, v1: i128): } ; block0: -; not a2,a2 -; or a0,a0,a2 +; not a4,a2 ; not a6,a3 +; or a0,a0,a4 ; or a1,a1,a6 ; ret @@ -707,9 +707,9 @@ block0(v0: i64): } ; block0: -; li t2,4 -; not a1,t2 -; or a0,a0,a1 +; li a1,4 +; not a2,a1 +; or a0,a0,a2 ; ret function %bor_not_i64_constant_shift(i64, i64) -> i64 { @@ -722,8 +722,8 @@ block0(v0: i64, v1: i64): ; block0: ; slli a2,a1,4 -; not a1,a2 -; or a0,a0,a1 +; not a2,a2 +; or a0,a0,a2 ; ret function %bxor_not_i32(i32, i32) -> i32 { @@ -755,9 +755,9 @@ block0(v0: i128, v1: i128): } ; block0: -; not a2,a2 -; xor a0,a0,a2 +; not a4,a2 ; not a6,a3 +; xor a0,a0,a4 ; xor a1,a1,a6 ; ret @@ -769,9 +769,9 @@ block0(v0: i64): } ; block0: -; li t2,4 -; not a1,t2 -; xor a0,a0,a1 +; li a1,4 +; not a2,a1 +; xor a0,a0,a2 ; ret function %bxor_not_i64_constant_shift(i64, i64) -> i64 { @@ -784,8 +784,8 @@ block0(v0: i64, v1: i64): ; block0: ; slli a2,a1,4 -; not a1,a2 -; xor a0,a0,a1 +; not a2,a2 +; xor a0,a0,a2 ; ret function %ishl_i128_i8(i128, i8) -> i128 { diff --git a/cranelift/filetests/filetests/isa/s390x/bitops-optimized.clif b/cranelift/filetests/filetests/isa/s390x/bitops-optimized.clif new file mode 100644 index 0000000000..091f1a6bf4 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/bitops-optimized.clif @@ -0,0 +1,66 @@ +test compile precise-output +set opt_level=speed +target s390x has_mie2 + +function %band_not_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = band_not.i32 v0, v1 + return v2 +} + +; block0: +; ncrk %r2, %r2, %r3 +; br %r14 + +function %band_not_i32_reversed(i32, i32) -> 
i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = band v2, v1 + return v3 +} + +; block0: +; ncrk %r2, %r3, %r2 +; br %r14 + +function %bor_not_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bor_not.i32 v0, v1 + return v2 +} + +; block0: +; ocrk %r2, %r2, %r3 +; br %r14 + +function %bor_not_i32_reversed(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = bor v2, v1 + return v3 +} + +; block0: +; ocrk %r2, %r3, %r2 +; br %r14 + +function %bxor_not_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bxor_not.i32 v0, v1 + return v2 +} + +; block0: +; nxrk %r2, %r2, %r3 +; br %r14 + +function %bxor_not_i32_reversed(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = bnot v0 + v3 = bxor v2, v1 + return v3 +} + +; block0: +; nxrk %r2, %r3, %r2 +; br %r14 diff --git a/cranelift/filetests/filetests/isa/s390x/bitwise.clif b/cranelift/filetests/filetests/isa/s390x/bitwise.clif index a2c1e6ccd2..ffa698326e 100644 --- a/cranelift/filetests/filetests/isa/s390x/bitwise.clif +++ b/cranelift/filetests/filetests/isa/s390x/bitwise.clif @@ -366,9 +366,8 @@ block0(v0: i32, v1: i32): } ; block0: -; lgr %r5, %r3 -; xilf %r5, 4294967295 -; nr %r2, %r5 +; xilf %r3, 4294967295 +; nr %r2, %r3 ; br %r14 function %band_not_i16(i16, i16) -> i16 { @@ -378,9 +377,8 @@ block0(v0: i16, v1: i16): } ; block0: -; lgr %r5, %r3 -; xilf %r5, 4294967295 -; nr %r2, %r5 +; xilf %r3, 4294967295 +; nr %r2, %r3 ; br %r14 function %band_not_i8(i8, i8) -> i8 { @@ -390,9 +388,8 @@ block0(v0: i8, v1: i8): } ; block0: -; lgr %r5, %r3 -; xilf %r5, 4294967295 -; nr %r2, %r5 +; xilf %r3, 4294967295 +; nr %r2, %r3 ; br %r14 function %bor_not_i128(i128, i128) -> i128 { @@ -427,9 +424,8 @@ block0(v0: i32, v1: i32): } ; block0: -; lgr %r5, %r3 -; xilf %r5, 4294967295 -; or %r2, %r5 +; xilf %r3, 4294967295 +; or %r2, %r3 ; br %r14 function %bor_not_i16(i16, i16) -> i16 { @@ -439,9 +435,8 @@ block0(v0: i16, v1: i16): } ; block0: -; lgr %r5, %r3 -; xilf %r5, 4294967295 -; or %r2, %r5 +; xilf %r3, 4294967295 +; or %r2, %r3 ; br %r14 function %bor_not_i8(i8, i8) -> i8 { @@ -451,9 +446,8 @@ block0(v0: i8, v1: i8): } ; block0: -; lgr %r5, %r3 -; xilf %r5, 4294967295 -; or %r2, %r5 +; xilf %r3, 4294967295 +; or %r2, %r3 ; br %r14 function %bxor_not_i128(i128, i128) -> i128 { @@ -476,9 +470,9 @@ block0(v0: i64, v1: i64): } ; block0: +; xilf %r3, 4294967295 +; xihf %r3, 4294967295 ; xgr %r2, %r3 -; xilf %r2, 4294967295 -; xihf %r2, 4294967295 ; br %r14 function %bxor_not_i32(i32, i32) -> i32 { @@ -488,8 +482,8 @@ block0(v0: i32, v1: i32): } ; block0: +; xilf %r3, 4294967295 ; xr %r2, %r3 -; xilf %r2, 4294967295 ; br %r14 function %bxor_not_i16(i16, i16) -> i16 { @@ -499,8 +493,8 @@ block0(v0: i16, v1: i16): } ; block0: +; xilf %r3, 4294967295 ; xr %r2, %r3 -; xilf %r2, 4294967295 ; br %r14 function %bxor_not_i8(i8, i8) -> i8 { @@ -510,8 +504,8 @@ block0(v0: i8, v1: i8): } ; block0: +; xilf %r3, 4294967295 ; xr %r2, %r3 -; xilf %r2, 4294967295 ; br %r14 function %bnot_i128(i128) -> i128 { diff --git a/cranelift/filetests/filetests/isa/x64/band_not_bmi1.clif b/cranelift/filetests/filetests/isa/x64/band_not_bmi1.clif index 6c448f42bb..29101eaff3 100644 --- a/cranelift/filetests/filetests/isa/x64/band_not_bmi1.clif +++ b/cranelift/filetests/filetests/isa/x64/band_not_bmi1.clif @@ -1,4 +1,5 @@ test compile precise-output +set opt_level=speed target x86_64 has_bmi1 function %f1(i8, i8) -> i8 { @@ -15,3 +16,19 @@ block0(v0: i8, v1: i8): ; popq %rbp ; ret + +function %reversed_operands(i8, i8) -> i8 { +block0(v0: i8, v1: i8): 
+ v2 = bnot v0 + v3 = band v2, v1 + return v3 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; andn %eax, %edi, %esi +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/wasm/i32-not-x64.wat b/cranelift/filetests/filetests/wasm/i32-not-x64.wat new file mode 100644 index 0000000000..45d3798e74 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/i32-not-x64.wat @@ -0,0 +1,46 @@ +;;!target = "x86_64" +;;!compile = true +;;!settings = ["opt_level=speed", "has_bmi1=true"] + +(module + ;; this should get optimized to a `bnot` in clif + (func (param i32) (result i32) + i32.const -1 + local.get 0 + i32.xor) + + ;; this should get optimized to a single `andn` instruction + (func (param i32 i32) (result i32) + local.get 0 + i32.const -1 + local.get 1 + i32.xor + i32.and) +) + +;; function u0:0: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; movq %rdi, %rax +;; notl %eax, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:1: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; andn %eax, %esi, %edi +;; movq %rbp, %rsp +;; popq %rbp +;; ret