Support i1 datatype with an experimental flag. (iree-org#18713)

lialan · giacs-epic · commit 6241825380b3 · 2024-12-04T14:52:50.000Z
Enable packed i1 datatype storage

This commit introduces support for packed storage of the `i1` (bit)
datatype. When subbyte type packing is enabled via the
`--iree-experimental-packed-i1-storage` option, vectors of `i1` elements
will be stored in a compact packed representation.

For example, a `vector<6xi1>` will occupy a single byte of memory with
the 6 bit elements packed together and 2 padding bits. A
`vector<3x3xi1>` will take up 2 bytes, with the 9 bit elements packed
across the bytes and 7 padding bits.

Limitations:
- To ensure correct behavior, the tiling configuration aligns the
innermost dimension data loads with byte boundaries. This is
necessitated by the current lack of emulation for unaligned subbyte
vector loading/storing.
- Unaligned subbyte emulation support can be added in the future, though
it may incur some performance overhead.

This change requires corresponding updates in the frontend to utilize
the packed `i1` storage format.

Signed-off-by: Alan Li <me@alanli.org>
Signed-off-by: Giacomo Serafini <179146510+giacs-epic@users.noreply.github.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2915,6 +2915,26 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
     }
   }
 
+  // Round the innermost vector tile size up so that a tile of i1 elements
+  // always spans a whole number of bytes. This is required for now because
+  // sub-byte vector stores are not fully supported. Once they are, this can
+  // be eliminated; note that emulating sub-byte sized vector loads and
+  // stores will have a performance impact.
+  //
+  // The result type is inspected with dyn_cast/isInteger rather than
+  // cast/getIntOrFloatBitWidth so that non-shaped results and element types
+  // without an integer or float bit width (e.g. complex) are skipped
+  // instead of tripping an assertion.
+  auto resultTypes = rootOperation->getResultTypes();
+  if (!commonVecTileSizes.empty() && !resultTypes.empty()) {
+    auto resultType = dyn_cast<ShapedType>(resultTypes.front());
+    // For now this is only enabled for i1, where one element occupies one
+    // bit, so aligning the element count to 8 aligns the tile to a byte.
+    if (resultType && resultType.getElementType().isInteger(1)) {
+      commonVecTileSizes.back() = llvm::alignTo(commonVecTileSizes.back(), 8);
+    }
+  }
+
   // Set the lowering configs with new tile sizes.
   for (auto op : computeOps) {
     int numLoops = cast<TilingInterface>(op).getLoopIteratorTypes().size();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1958,3 +1958,28 @@ func.func @test_tiling_cpu_default(%arg0: tensor<256x256xi8>, %arg1: tensor<256x
 //      CHECK: func @test_tiling_cpu_default(
 // CHECK-SAME:     translation_info = #[[TRANSLATION_INFO]]
 //      CHECK:    linalg.quantized_matmul {lowering_config = #[[CONFIG0]]}
+
+// -----
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+func.func @i1_type()  attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
+  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
+  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<8xi1>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
+  %5 = tensor.empty() : tensor<8xi1>
+  %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<8xi1>, tensor<8xi1>) outs(%5 : tensor<8xi1>) {
+  ^bb0(%in: i1, %in_0: i1, %out: i1):
+    %7 = arith.xori %in, %in_0 : i1
+    linalg.yield %7 : i1
+  } -> tensor<8xi1>
+  flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [8], strides = [1] : tensor<8xi1> -> !flow.dispatch.tensor<writeonly:tensor<8xi1>>
+  return
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8], [8], [0], [0]]>
+// CHECK: func @i1_type()
+// CHECK: linalg.generic {
+// CHECK-SAME: {lowering_config = #[[CONFIG]]}
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
@@ -29,6 +29,7 @@ iree_lit_test_suite(
             "encode_device_tensors_packing.mlir",
             "encode_host_tensors.mlir",
             "encode_host_tensors_packing.mlir",
+            "encode_host_tensors_packing_i1.mlir",
             "fold_globals.mlir",
             "fold_uniform_operands.mlir",
             "fuse_dispatch_bindings.mlir",
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
@@ -27,6 +27,7 @@ iree_lit_test_suite(
     "encode_device_tensors_packing.mlir"
     "encode_host_tensors.mlir"
     "encode_host_tensors_packing.mlir"
+    "encode_host_tensors_packing_i1.mlir"
     "fold_globals.mlir"
     "fold_uniform_operands.mlir"
     "fuse_dispatch_bindings.mlir"
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
@@ -85,7 +85,7 @@ util.func public @denseTensorSizeOfDynamic(%arg0: index) -> index {
   // CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
   // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
   // CHECK: %[[MUL:.+]] = arith.muli %arg0, %[[C5]] : index
-  // CHECK: %[[DIV:.+]] = arith.divui %[[MUL]], %[[C2]] : index
+  // CHECK: %[[DIV:.+]] = arith.ceildivui %[[MUL]], %[[C2]] : index
   %0 = stream.tensor.sizeof tensor<?x5xi4>{%arg0} : index
   // CHECK: util.return %[[DIV]]
   util.return %0 : index
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1.mlir
@@ -0,0 +1,20 @@
+// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --iree-experimental-packed-i1-storage %s | FileCheck %s
+
+func.func @unaligned_i1_size() -> index {
+  %0 = stream.tensor.sizeof tensor<12xi1> : index
+  return %0 : index
+}
+// CHECK: func @unaligned_i1_size() -> index {
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: return %[[C2]] : index
+
+// -----
+
+func.func @aligned_i1_size() -> index {
+  %0 = stream.tensor.sizeof tensor<24xi1> : index
+  return %0 : index
+}
+
+// CHECK: func @aligned_i1_size() -> index {
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK: return %[[C3]] : index
diff --git a/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp b/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp
@@ -15,7 +15,17 @@
 
 namespace mlir::iree_compiler {
 
+llvm::cl::opt<bool> clEnableI1Support(
+    "iree-experimental-packed-i1-storage",
+    llvm::cl::desc(
+        "Experimental feature: enable i1 data type support in codegen"),
+    llvm::cl::init(false));
+
 bool needToPackSubByteElementBitWidth(unsigned bitWidth) {
+  // Enable i1 support if requested.
+  if (clEnableI1Support && bitWidth == 1) {
+    return true;
+  }
   // Require the original bit width to be some power of two for now to avoid
   // trickiness and weirdness of packing and cross-byte access.
   // Also disallow boolean values for now--they may require separate interface
@@ -114,15 +124,14 @@ Value calculateStorageElementCountInBytes(Location loc,
   if (needToPackSubByteElementBitWidth(elementBits)) {
     assert(8 % elementBits == 0);
     unsigned byteElements = 8 / elementBits;
-    // Perform some basic sanity check to make sure the total count is byte
-    // aligned for fully static shapes.
-    if (paddedDynamicDims.empty() && (staticCount * elementBits) % 8 != 0) {
-      return nullptr;
-    }
-    auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
     // TODO(antiagainst): We may want to emit runtime check to make sure this is
     // divisible.
-    value = builder.createOrFold<arith::DivUIOp>(loc, value, divisor);
+    auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
+    if (!clEnableI1Support && paddedDynamicDims.empty() &&
+        (staticCount * elementBits) % 8 != 0) {
+      return nullptr;
+    }
+    value = builder.createOrFold<arith::CeilDivUIOp>(loc, value, divisor);
   }
 
   return value;
diff --git a/tests/e2e/subbyte_types/BUILD.bazel b/tests/e2e/subbyte_types/BUILD.bazel
@@ -0,0 +1,50 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# End-to-end tests of IREE support for sub-byte element types (currently
+# packed `i1` storage, enabled with `--iree-experimental-packed-i1-storage`).
+# Each test file should exercise sub-byte tensors through the full compile
+# and run flow. Tests should be written using the IREE Check framework.
+# See https://iree.dev/developers/general/testing-guide/#iree-core-end-to-end-e2e-tests.
+
+load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
+load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")
+
+package(
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+LLVM_SRCS = enforce_glob(
+    # keep sorted
+    [
+        "subbyte_types.mlir",
+    ],
+    include = ["*.mlir"],
+    exclude = [],
+)
+
+iree_check_single_backend_test_suite(
+    name = "check_llvm-cpu_subbyte_emulation",
+    srcs = LLVM_SRCS,
+    compiler_flags = [
+        "--iree-llvmcpu-target-cpu=generic",
+        "--iree-experimental-packed-i1-storage",
+    ],
+    driver = "local-task",
+    tags = [
+        # subbyte support for wasm is not on priorities.
+        "nowasm",
+    ],
+    target_backend = "llvm-cpu",
+)
+
+test_suite(
+    name = "check",
+    tests = [
+        ":check_llvm-cpu_subbyte_emulation",
+    ],
+)
diff --git a/tests/e2e/subbyte_types/CMakeLists.txt b/tests/e2e/subbyte_types/CMakeLists.txt
@@ -0,0 +1,29 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# tests/e2e/subbyte_types/BUILD.bazel                                          #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_check_single_backend_test_suite(
+  NAME
+    check_llvm-cpu_subbyte_emulation
+  SRCS
+    "subbyte_types.mlir"
+  TARGET_BACKEND
+    "llvm-cpu"
+  DRIVER
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-llvmcpu-target-cpu=generic"
+    "--iree-experimental-packed-i1-storage"
+  LABELS
+    "nowasm"
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/tests/e2e/subbyte_types/subbyte_types.mlir b/tests/e2e/subbyte_types/subbyte_types.mlir
@@ -0,0 +1,28 @@
+func.func @i1_type() {
+  %c0 = arith.constant 0 : index
+  %c255 = arith.constant 255 : i8
+  %input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8>  // b01010101
+  %input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // b10101010
+  %lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1>
+  %rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1>
+  %empty = tensor.empty() : tensor<8xi1>
+  %res = linalg.generic
+        {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
+        ins(%lhs, %rhs : tensor<8xi1>, tensor<8xi1>) outs(%empty: tensor<8xi1>) {
+  ^bb0(%inlhs: i1, %inrhs: i1, %out: i1):
+    %inres = arith.xori %inlhs, %inrhs: i1
+    linalg.yield %inres : i1
+  } -> tensor<8xi1>
+  %tensor_res = flow.tensor.bitcast %res : tensor<8xi1> -> tensor<1xi8>
+  check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
+  return
+}
+
+func.func @i1_type_slice() {
+  %input = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8>
+  %flat_input_all = flow.tensor.bitcast %input : tensor<3xi8> -> tensor<24xi1>
+  %slice = tensor.extract_slice %flat_input_all[8][8][1] : tensor<24xi1> to tensor<8xi1>
+  %tensor_res = flow.tensor.bitcast %slice : tensor<8xi1> -> tensor<1xi8>
+  check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
+  return
+}