Merge pull request #3653 from fitzgen/bitselect-isle

cranelift: Port `bitselect` over to ISLE on x64
2022-01-06 13:06:43 -08:00
parent ff533dc7d4 b60a4df2af
commit 7fd78da23f
9 changed files with 408 additions and 317 deletions
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1301,36 +1301,6 @@ impl Inst {
        }
    }

-    /// Choose which instruction to use for computing a bitwise AND on two values.
-    pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
-        match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to),
-            _ => unimplemented!("unimplemented type for Inst::and: {}", ty),
-        }
-    }
-
-    /// Choose which instruction to use for computing a bitwise AND NOT on two values.
-    pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
-        match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to),
-            _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty),
-        }
-    }
-
-    /// Choose which instruction to use for computing a bitwise OR on two values.
-    pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
-        match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to),
-            _ => unimplemented!("unimplemented type for Inst::or: {}", ty),
-        }
-    }
-
    /// Translate three-operand instructions into a sequence of two-operand
    /// instructions.
    ///
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -360,18 +360,14 @@

 ;; SSE.

-(rule (lower (has_type $F32X4 (band x y)))
-      (value_reg (andps (put_in_reg x)
-                        (put_in_reg_mem y))))
+(decl sse_and (Type Reg RegMem) Reg)
+(rule (sse_and $F32X4 x y) (andps x y))
+(rule (sse_and $F64X2 x y) (andpd x y))
+(rule (sse_and (multi_lane _bits _lanes) x y) (pand x y))

-(rule (lower (has_type $F64X2 (band x y)))
-      (value_reg (andpd (put_in_reg x)
-                        (put_in_reg_mem y))))
-
-(rule (lower (has_type (multi_lane _bits _lanes)
+(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (band x y)))
-      (value_reg (pand (put_in_reg x)
-                       (put_in_reg_mem y))))
+      (value_reg (sse_and ty (put_in_reg x) (put_in_reg_mem y))))

 ;; `{i,b}128`.

@@ -436,18 +432,14 @@

 ;; SSE.

-(rule (lower (has_type $F32X4 (bor x y)))
-      (value_reg (orps (put_in_reg x)
-                       (put_in_reg_mem y))))
+(decl sse_or (Type Reg RegMem) Reg)
+(rule (sse_or $F32X4 x y) (orps x y))
+(rule (sse_or $F64X2 x y) (orpd x y))
+(rule (sse_or (multi_lane _bits _lanes) x y) (por x y))

-(rule (lower (has_type $F64X2 (bor x y)))
-      (value_reg (orpd (put_in_reg x)
-                       (put_in_reg_mem y))))
-
-(rule (lower (has_type (multi_lane _bits _lanes)
+(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (bor x y)))
-      (value_reg (por (put_in_reg x)
-                      (put_in_reg_mem y))))
+      (value_reg (sse_or ty (put_in_reg x) (put_in_reg_mem y))))

 ;; `{i,b}128`.

@@ -960,6 +952,11 @@

 ;;;; Rules for `band_not` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+(decl sse_and_not (Type Reg RegMem) Reg)
+(rule (sse_and_not $F32X4 x y) (andnps x y))
+(rule (sse_and_not $F64X2 x y) (andnpd x y))
+(rule (sse_and_not (multi_lane _bits _lanes) x y) (pandn x y))
+
 ;; Note the flipping of operands below. CLIF specifies
 ;;
 ;;   band_not(x, y) = and(x, not(y))
@@ -967,15 +964,10 @@
 ;; while x86 does
 ;;
 ;;   pandn(x, y) = and(not(x), y)
-
-(rule (lower (has_type $F32X4 (band_not x y)))
-      (value_reg (andnps (put_in_reg y) (put_in_reg_mem x))))
-
-(rule (lower (has_type $F64X2 (band_not x y)))
-      (value_reg (andnpd (put_in_reg y) (put_in_reg_mem x))))
-
-(rule (lower (has_type (multi_lane _bits _lanes) (band_not x y)))
-      (value_reg (pandn (put_in_reg y) (put_in_reg_mem x))))
+(rule (lower (has_type ty (band_not x y)))
+      (value_reg (sse_and_not ty
+                              (put_in_reg y)
+                              (put_in_reg_mem x))))

 ;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -1044,6 +1036,20 @@
 (rule (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x)))
      (value_reg (sse_xor ty (put_in_reg x) (RegMem.Reg (vector_all_ones ty)))))

+;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type ty @ (multi_lane _bits _lanes)
+                       (bitselect condition
+                                  if_true
+                                  if_false)))
+      ;; a = and(if_true, condition)
+      ;; b = and(not(condition), if_false)
+      ;; result = or(b, a)
+      (let ((cond_reg Reg (put_in_reg condition))
+            (a Reg (sse_and ty (put_in_reg if_true) (RegMem.Reg cond_reg)))
+            (b Reg (sse_and_not ty cond_reg (put_in_reg_mem if_false))))
+        (value_reg (sse_or ty b (RegMem.Reg a)))))
+
 ;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower (insertlane vec @ (value_type ty) val (u8_from_uimm8 idx)))
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1534,30 +1534,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        | Opcode::Umax
        | Opcode::Imin
        | Opcode::Umin
-        | Opcode::Bnot => implemented_in_isle(ctx),
-
-        Opcode::Bitselect => {
-            let ty = ty.unwrap();
-            let condition = put_input_in_reg(ctx, inputs[0]);
-            let if_true = put_input_in_reg(ctx, inputs[1]);
-            let if_false = input_to_reg_mem(ctx, inputs[2]);
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            if ty.is_vector() {
-                let tmp1 = ctx.alloc_tmp(ty).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(tmp1, if_true, ty));
-                ctx.emit(Inst::and(ty, RegMem::reg(condition.clone()), tmp1));
-
-                let tmp2 = ctx.alloc_tmp(ty).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(tmp2, condition, ty));
-                ctx.emit(Inst::and_not(ty, if_false, tmp2));
-
-                ctx.emit(Inst::gen_move(dst, tmp2.to_reg(), ty));
-                ctx.emit(Inst::or(ty, RegMem::from(tmp1), dst));
-            } else {
-                unimplemented!("no lowering for scalar bitselect instruction")
-            }
-        }
+        | Opcode::Bnot
+        | Opcode::Bitselect => implemented_in_isle(ctx),

        Opcode::Vselect => {
            let ty = ty.unwrap();
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
 src/prelude.isle babc931e5dc5b4cf
 src/isa/x64/inst.isle fb5d3ac8e68c46d2
-src/isa/x64/lower.isle d39e01add89178d5
+src/isa/x64/lower.isle 5d66b88a371d4d70
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
--- a/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -2,6 +2,87 @@ test compile
 set enable_simd
 target x86_64 skylake

+function %band_f32x4(f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4):
+    v2 = band v0, v1
+    return v2
+}
+; check: andps
+; not: andpd
+; not: pand
+
+function %band_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2):
+    v2 = band v0, v1
+    return v2
+}
+; check: andpd
+; not: andps
+; not: pand
+
+function %band_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = band v0, v1
+    return v2
+}
+; check: pand
+; not: andps
+; not: andpd
+
+function %bor_f32x4(f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4):
+    v2 = bor v0, v1
+    return v2
+}
+; check: orps
+; not: orpd
+; not: por
+
+function %bor_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2):
+    v2 = bor v0, v1
+    return v2
+}
+; check: orpd
+; not: orps
+; not: por
+
+function %bor_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bor v0, v1
+    return v2
+}
+; check: por
+; not: orps
+; not: orpd
+
+function %bxor_f32x4(f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4):
+    v2 = bxor v0, v1
+    return v2
+}
+; check: xorps
+; not: xorpd
+; not: pxor
+
+function %bxor_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2):
+    v2 = bxor v0, v1
+    return v2
+}
+; check: xorpd
+; not: xorps
+; not: pxor
+
+function %bxor_i32x4(i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4):
+    v2 = bxor v0, v1
+    return v2
+}
+; check: pxor
+; not: xorps
+; not: xorpd
+
 function %bitselect_i16x8() -> i16x8 {
 block0:
    v0 = vconst.i16x8 [0 0 0 0 0 0 0 0]
--- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif
@@ -0,0 +1,14 @@
+test run
+set enable_simd
+target aarch64
+target x86_64
+
+function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4, v2: i32x4):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; run: %bitselect_i32x4(0x00000000000000000000000000000000, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x00000000000000000000000000000000
+; run: %bitselect_i32x4(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
+; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
+; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
--- a/crates/wasmtime/src/func.rs
+++ b/crates/wasmtime/src/func.rs
@@ -1459,7 +1459,7 @@ macro_rules! impl_wasm_host_results {
            fn func_type(params: impl Iterator<Item = ValType>) -> FuncType {
                FuncType::new(
                    params,
-                    std::array::IntoIter::new([$($t::valtype(),)*]),
+                    IntoIterator::into_iter([$($t::valtype(),)*]),
                )
            }

--- a/tests/all/limits.rs
+++ b/tests/all/limits.rs
@@ -28,7 +28,7 @@ fn test_limits() -> Result<()> {
    let instance = Instance::new(&mut store, &module, &[])?;

    // Test instance exports and host objects hitting the limit
-    for memory in std::array::IntoIter::new([
+    for memory in IntoIterator::into_iter([
        instance.get_memory(&mut store, "m").unwrap(),
        Memory::new(&mut store, MemoryType::new(0, None))?,
    ]) {
@@ -46,7 +46,7 @@ fn test_limits() -> Result<()> {
    }

    // Test instance exports and host objects hitting the limit
-    for table in std::array::IntoIter::new([
+    for table in IntoIterator::into_iter([
        instance.get_table(&mut store, "t").unwrap(),
        Table::new(
            &mut store,
@@ -137,7 +137,7 @@ async fn test_limits_async() -> Result<()> {
    let instance = Instance::new_async(&mut store, &module, &[]).await?;

    // Test instance exports and host objects hitting the limit
-    for memory in std::array::IntoIter::new([
+    for memory in IntoIterator::into_iter([
        instance.get_memory(&mut store, "m").unwrap(),
        Memory::new_async(&mut store, MemoryType::new(0, None)).await?,
    ]) {
@@ -156,7 +156,7 @@ async fn test_limits_async() -> Result<()> {
    }

    // Test instance exports and host objects hitting the limit
-    for table in std::array::IntoIter::new([
+    for table in IntoIterator::into_iter([
        instance.get_table(&mut store, "t").unwrap(),
        Table::new_async(
            &mut store,
@@ -201,7 +201,7 @@ fn test_limits_memory_only() -> Result<()> {
    let instance = Instance::new(&mut store, &module, &[])?;

    // Test instance exports and host objects hitting the limit
-    for memory in std::array::IntoIter::new([
+    for memory in IntoIterator::into_iter([
        instance.get_memory(&mut store, "m").unwrap(),
        Memory::new(&mut store, MemoryType::new(0, None))?,
    ]) {
@@ -219,7 +219,7 @@ fn test_limits_memory_only() -> Result<()> {
    }

    // Test instance exports and host objects *not* hitting the limit
-    for table in std::array::IntoIter::new([
+    for table in IntoIterator::into_iter([
        instance.get_table(&mut store, "t").unwrap(),
        Table::new(
            &mut store,
@@ -282,7 +282,7 @@ fn test_limits_table_only() -> Result<()> {
    let instance = Instance::new(&mut store, &module, &[])?;

    // Test instance exports and host objects *not* hitting the limit
-    for memory in std::array::IntoIter::new([
+    for memory in IntoIterator::into_iter([
        instance.get_memory(&mut store, "m").unwrap(),
        Memory::new(&mut store, MemoryType::new(0, None))?,
    ]) {
@@ -293,7 +293,7 @@ fn test_limits_table_only() -> Result<()> {
    }

    // Test instance exports and host objects hitting the limit
-    for table in std::array::IntoIter::new([
+    for table in IntoIterator::into_iter([
        instance.get_table(&mut store, "t").unwrap(),
        Table::new(
            &mut store,