diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index e6dceabff1..f08d6ad5a7 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1255,6 +1255,26 @@
             (b Xmm (sse_and_not ty cond_xmm if_false)))
         (sse_or ty b a)))
 
+;; If every byte of the condition is guaranteed to be all ones or all zeroes,
+;; we can use x64_blend like vselect does. That guarantee is per byte, so the
+;; blend is always requested as $I8X16: x64_blend picks its instruction from
+;; the type, and a float vector type would test only each lane's sign bit.
+(rule 1 (lower (has_type (multi_lane _bits _lanes)
+                         (bitselect condition if_true if_false)))
+      (if (all_ones_or_all_zeros condition))
+      (x64_blend $I8X16 condition if_true if_false))
+
+;; Succeeds when `condition` is known to be a valid byte-wise blend mask: a
+;; vector `icmp`/`fcmp` result, or a `vconst` whose bytes are all 0x00 or 0xFF.
+(decl pure all_ones_or_all_zeros (Value) bool)
+(rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) $true)
+(rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) $true)
+(rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) $true)
+
+;; Matches a `Constant` only if each of its bytes is 0x00 or 0xFF (see isle.rs).
+(decl pure vconst_all_ones_or_all_zeros () Constant)
+(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros)
+
 ;;;; Rules for `vselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type ty @ (multi_lane _bits _lanes)
diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index 6258fb6d03..17776d289e 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -713,6 +713,15 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         targets.len() as u32
     }
 
+    /// Extractor: matches only when every byte of `constant` is 0x00 or 0xFF.
+    #[inline]
+    fn vconst_all_ones_or_all_zeros(&mut self, constant: Constant) -> Option<()> {
+        let bytes = self.lower_ctx.get_constant_data(constant);
+        let uniform = bytes.iter().all(|&byte| matches!(byte, 0x00 | 0xFF));
+        // `Some(())` lets the ISLE pattern match; `None` makes it fall through.
+        if uniform { Some(()) } else { None }
+    }
+
     #[inline]
     fn fcvt_uint_mask_const(&mut self) -> VCodeConstant {
         self.lower_ctx
diff --git a/cranelift/codegen/src/simple_preopt.rs b/cranelift/codegen/src/simple_preopt.rs
index d107f1554c..08c3660213 100644
--- a/cranelift/codegen/src/simple_preopt.rs
+++ b/cranelift/codegen/src/simple_preopt.rs
@@ -826,67 +826,6 @@ mod simplify {
                 }
             }
 
-            InstructionData::Ternary {
-                opcode: Opcode::Bitselect,
-                args,
-            } => {
-                let old_cond_type = pos.func.dfg.value_type(args[0]);
-                if !old_cond_type.is_vector() {
-                    return;
-                }
-
-                // Replace bitselect with vselect if each lane of controlling mask is either
-                // all ones or all zeroes; on x86 bitselect is encoded using 3 instructions,
-                // while vselect can be encoded using single BLEND instruction.
-                if let ValueDef::Result(def_inst, _) = pos.func.dfg.value_def(args[0]) {
-                    let (cond_val, cond_type) = match pos.func.dfg[def_inst] {
-                        InstructionData::IntCompare { .. }
-                        | InstructionData::FloatCompare { .. } => {
-                            // If the controlled mask is from a comparison, the value will be all
-                            // zeros or ones in each output lane.
-                            let arg = args[0];
-                            let arg_type = pos.func.dfg.value_type(arg);
-                            if !arg_type.is_vector() {
-                                return;
-                            }
-                            (arg, arg_type)
-                        }
-                        InstructionData::UnaryConst {
-                            opcode: Opcode::Vconst,
-                            constant_handle,
-                        } => {
-                            // If each byte of controlling mask is 0x00 or 0xFF then
-                            // we will always bitcast our way to vselect(I8x16, I8x16).
-                            // Bitselect operates at bit level, so the lane types don't matter.
-                            let const_data = pos.func.dfg.constants.get(constant_handle);
-                            if !const_data.iter().all(|&b| b == 0 || b == 0xFF) {
-                                return;
-                            }
-                            let new_type = I8.by(old_cond_type.bytes()).unwrap();
-                            (pos.ins().bitcast(new_type, args[0]), new_type)
-                        }
-                        _ => return,
-                    };
-
-                    let lane_type = Type::int(cond_type.lane_bits() as u16).unwrap();
-                    let arg_type = lane_type.by(cond_type.lane_count()).unwrap();
-                    let old_arg_type = pos.func.dfg.value_type(args[1]);
-
-                    if arg_type != old_arg_type {
-                        // Operands types must match, we need to add bitcasts.
-                        let arg1 = pos.ins().bitcast(arg_type, args[1]);
-                        let arg2 = pos.ins().bitcast(arg_type, args[2]);
-                        let ret = pos.ins().vselect(cond_val, arg1, arg2);
-                        pos.func.dfg.replace(inst).bitcast(old_arg_type, ret);
-                    } else {
-                        pos.func
-                            .dfg
-                            .replace(inst)
-                            .vselect(cond_val, args[1], args[2]);
-                    }
-                }
-            }
-
             _ => {}
         }
     }
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
new file mode 100644
index 0000000000..edcc1f2771
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitselect.clif
@@ -0,0 +1,123 @@
+test compile precise-output
+set enable_simd
+target x86_64 skylake
+
+function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = icmp eq v0, v1
+    v3 = bitselect v2, v0, v1
+    return v3
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm5
+;   pcmpeqb %xmm5, %xmm1, %xmm5
+;   movdqa  %xmm0, %xmm8
+;   movdqa  %xmm5, %xmm0
+;   movdqa  %xmm1, %xmm6
+;   pblendvb %xmm6, %xmm8, %xmm6
+;   movdqa  %xmm6, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %mask_from_fcmp(f32x4, f32x4, i32x4, i32x4) -> i32x4  {
+block0(v0: f32x4, v1: f32x4, v2: i32x4, v3: i32x4):
+    v4 = fcmp eq v0, v1
+    v5 = bitselect v4, v2, v3
+    return v5
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cmpps   $0, %xmm0, %xmm1, %xmm0
+;   movdqa  %xmm3, %xmm8
+;   pblendvb %xmm8, %xmm2, %xmm8
+;   movdqa  %xmm8, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
+block0(v0: i8x16, v1: i8x16, v2: i32x4):
+    v3 = bitcast.i8x16 v2
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm5
+;   pand    %xmm5, %xmm2, %xmm5
+;   movdqa  %xmm2, %xmm0
+;   pandn   %xmm0, %xmm1, %xmm0
+;   por     %xmm0, %xmm5, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm6
+;   movdqu  const(0), %xmm0
+;   movdqa  %xmm6, %xmm8
+;   movdqa  %xmm1, %xmm6
+;   pblendvb %xmm6, %xmm8, %xmm6
+;   movdqa  %xmm6, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8):
+    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqa  %xmm0, %xmm6
+;   movdqu  const(0), %xmm0
+;   movdqa  %xmm6, %xmm8
+;   movdqa  %xmm1, %xmm6
+;   pblendvb %xmm6, %xmm8, %xmm6
+;   movdqa  %xmm6, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %bad_const_mask(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
+    v4 = bitselect v3, v0, v1
+    return v4
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   movdqu  const(0), %xmm6
+;   movdqa  %xmm6, %xmm9
+;   movdqa  %xmm0, %xmm5
+;   pand    %xmm5, %xmm9, %xmm5
+;   movdqa  %xmm9, %xmm0
+;   pandn   %xmm0, %xmm1, %xmm0
+;   por     %xmm0, %xmm5, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
diff --git a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
index 231c2fc9e4..c35942613a 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -128,28 +128,6 @@ block0(v0: i32x4, v1: i32x4):
 ;   popq    %rbp
 ;   ret
 
-function %bitselect_i16x8() -> i16x8 {
-block0:
-    v0 = vconst.i16x8 [0 0 0 0 0 0 0 0]
-    v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
-    v2 = vconst.i16x8 [0 0 0 0 0 0 0 0]
-    v3 = bitselect v0, v1, v2
-    return v3
-}
-
-;   pushq   %rbp
-;   movq    %rsp, %rbp
-; block0:
-;   movdqu  const(0), %xmm0
-;   movdqu  const(0), %xmm2
-;   movdqu  const(0), %xmm6
-;   pand    %xmm2, %xmm0, %xmm2
-;   pandn   %xmm0, %xmm6, %xmm0
-;   por     %xmm0, %xmm2, %xmm0
-;   movq    %rbp, %rsp
-;   popq    %rbp
-;   ret
-
 function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
 block0(v0: i16x8, v1: i16x8, v2: i16x8):
     v3 = vselect v0, v1, v2
diff --git a/cranelift/filetests/filetests/simple_preopt/bitselect.clif b/cranelift/filetests/filetests/simple_preopt/bitselect.clif
deleted file mode 100644
index b7ba46f5c0..0000000000
--- a/cranelift/filetests/filetests/simple_preopt/bitselect.clif
+++ /dev/null
@@ -1,52 +0,0 @@
-test simple_preopt
-target aarch64
-target x86_64
-
-;; Test replacement of bitselect with vselect for special masks
-
-function %mask_from_icmp(i8x16, i8x16) -> i8x16 {
-block0(v0: i8x16, v1: i8x16):
-    v2 = icmp eq v0, v1
-    v3 = bitselect v2, v0, v1
-    ; check: v3 = vselect v2, v0, v1
-    return v3
-}
-
-;; We can't guarantee that the i32x4 has all ones or zeros in each lane, so we
-;; can't remove the bitselect in this case.
-function %mask_casted(i8x16, i8x16, i32x4) -> i8x16 {
-block0(v0: i8x16, v1: i8x16, v2: i32x4):
-    v3 = bitcast.i8x16 v2
-    v4 = bitselect v3, v0, v1
-    ; check: v4 = bitselect v3, v0, v1
-    return v4
-}
-
-function %good_const_mask_i8x16(i8x16, i8x16) -> i8x16 {
-block0(v0: i8x16, v1: i8x16):
-    v3 = vconst.i8x16 [0 0 0xFF 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
-    v4 = bitselect v3, v0, v1
-    ; check:  v5 = bitcast.i8x16 v3
-    ; nextln: v4 = vselect v5, v0, v1
-    return v4
-}
-
-function %good_const_mask_i16x8(i16x8, i16x8) -> i16x8 {
-block0(v0: i16x8, v1: i16x8):
-    v3 = vconst.i16x8 [0x0000 0xFF00 0x0000 0x00FF 0x0000 0xFFFF 0x00FF 0xFFFF]
-    v4 = bitselect v3, v0, v1
-    ; check:  v5 = bitcast.i8x16 v3
-    ; nextln: v6 = bitcast.i8x16 v0
-    ; nextln: v7 = bitcast.i8x16 v1
-    ; nextln: v8 = vselect v5, v6, v7
-    ; nextln: v4 = bitcast.i16x8 v8
-    return v4
-}
-
-function %bad_const_mask(i8x16, i8x16) -> i8x16 {
-block0(v0: i8x16, v1: i8x16):
-    v3 = vconst.i8x16 [0 0 0xF0 0 0 0xFF 0 0 0 0 0xFF 0 0 0 0 0xFF]
-    v4 = bitselect v3, v0, v1
-    ; check: v4 = bitselect v3, v0, v1
-    return v4
-}