Skip to content

Commit 6241825

Browse files
lialan authored and giacs-epic committed
Support i1 datatype with an experimental flag. (iree-org#18713)
Enable packed i1 datatype storage

This commit introduces support for packed storage of the `i1` (bit) datatype. When subbyte type packing is enabled via the `--iree-experimental-packed-i1-storage` option, vectors of `i1` elements will be stored in a compact packed representation. For example, a `vector<6xi1>` will occupy a single byte of memory, with the 6 bit elements packed together and 2 padding bits. A `vector<3x3xi1>` will take up 2 bytes, with the 9 bit elements packed across the bytes and 7 padding bits.

Limitations:
- To ensure correct behavior, the tiling configuration aligns the innermost dimension data loads with byte boundaries. This is necessitated by the current lack of emulation for unaligned subbyte vector loading/storing.
- Unaligned subbyte emulation support can be added in the future, though it may incur some performance overhead.

This change requires corresponding updates in the frontend to utilize the packed `i1` storage format.

Signed-off-by: Alan Li <[email protected]>
Signed-off-by: Giacomo Serafini <[email protected]>
1 parent 5d20fdd commit 6241825

File tree

10 files changed

+191
-8
lines changed

10 files changed

+191
-8
lines changed

compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp

+20
Original file line numberDiff line numberDiff line change
@@ -2915,6 +2915,26 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
29152915
}
29162916
}
29172917

2918+
// Make sure the innermost tile size times element size is multiple
2919+
// of byte bits. This is required for now because we do not fully
2920+
// support sub-byte vector stores. Once vector stores are supported
2921+
// then this can be eliminated. Note that emulating sub-byte sized vector
2922+
// loads and stores will have a performance impact.
2923+
auto resultTypes = rootOperation->getResultTypes();
2924+
if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
2925+
auto elementTypeSize =
2926+
cast<ShapedType>(rootOperation->getResultTypes().front())
2927+
.getElementType()
2928+
.getIntOrFloatBitWidth();
2929+
// for now just enable for i1
2930+
if (elementTypeSize == 1) {
2931+
auto innermostTileSize = commonVecTileSizes.back();
2932+
commonVecTileSizes.back() =
2933+
llvm::alignTo(innermostTileSize * elementTypeSize, 8) /
2934+
elementTypeSize;
2935+
}
2936+
}
2937+
29182938
// Set the lowering configs with new tile sizes.
29192939
for (auto op : computeOps) {
29202940
int numLoops = cast<TilingInterface>(op).getLoopIteratorTypes().size();

compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir

+25
Original file line numberDiff line numberDiff line change
@@ -1958,3 +1958,28 @@ func.func @test_tiling_cpu_default(%arg0: tensor<256x256xi8>, %arg1: tensor<256x
19581958
// CHECK: func @test_tiling_cpu_default(
19591959
// CHECK-SAME: translation_info = #[[TRANSLATION_INFO]]
19601960
// CHECK: linalg.quantized_matmul {lowering_config = #[[CONFIG0]]}
1961+
1962+
// -----
1963+
1964+
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
1965+
func.func @i1_type() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
1966+
%c0 = arith.constant 0 : index
1967+
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
1968+
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
1969+
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<8xi1>>
1970+
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
1971+
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
1972+
%5 = tensor.empty() : tensor<8xi1>
1973+
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<8xi1>, tensor<8xi1>) outs(%5 : tensor<8xi1>) {
1974+
^bb0(%in: i1, %in_0: i1, %out: i1):
1975+
%7 = arith.xori %in, %in_0 : i1
1976+
linalg.yield %7 : i1
1977+
} -> tensor<8xi1>
1978+
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [8], strides = [1] : tensor<8xi1> -> !flow.dispatch.tensor<writeonly:tensor<8xi1>>
1979+
return
1980+
}
1981+
1982+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8], [8], [0], [0]]>
1983+
// CHECK: func @i1_type()
1984+
// CHECK: linalg.generic {
1985+
// CHECK-SAME: {lowering_config = #[[CONFIG]]}

compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ iree_lit_test_suite(
2929
"encode_device_tensors_packing.mlir",
3030
"encode_host_tensors.mlir",
3131
"encode_host_tensors_packing.mlir",
32+
"encode_host_tensors_packing_i1.mlir",
3233
"fold_globals.mlir",
3334
"fold_uniform_operands.mlir",
3435
"fuse_dispatch_bindings.mlir",

compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ iree_lit_test_suite(
2727
"encode_device_tensors_packing.mlir"
2828
"encode_host_tensors.mlir"
2929
"encode_host_tensors_packing.mlir"
30+
"encode_host_tensors_packing_i1.mlir"
3031
"fold_globals.mlir"
3132
"fold_uniform_operands.mlir"
3233
"fuse_dispatch_bindings.mlir"

compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ util.func public @denseTensorSizeOfDynamic(%arg0: index) -> index {
8585
// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
8686
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
8787
// CHECK: %[[MUL:.+]] = arith.muli %arg0, %[[C5]] : index
88-
// CHECK: %[[DIV:.+]] = arith.divui %[[MUL]], %[[C2]] : index
88+
// CHECK: %[[DIV:.+]] = arith.ceildivui %[[MUL]], %[[C2]] : index
8989
%0 = stream.tensor.sizeof tensor<?x5xi4>{%arg0} : index
9090
// CHECK: util.return %[[DIV]]
9191
util.return %0 : index
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --iree-experimental-packed-i1-storage %s | FileCheck %s
2+
3+
func.func @unaligned_i1_size() -> index {
4+
%0 = stream.tensor.sizeof tensor<12xi1> : index
5+
return %0 : index
6+
}
7+
// CHECK: func @unaligned_i1_size() -> index {
8+
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
9+
// CHECK: return %[[C2]] : index
10+
11+
// -----
12+
13+
func.func @aligned_i1_size() -> index {
14+
%0 = stream.tensor.sizeof tensor<24xi1> : index
15+
return %0 : index
16+
}
17+
18+
// CHECK: func @aligned_i1_size() -> index {
19+
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
20+
// CHECK: return %[[C3]] : index

compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp

+16-7
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,17 @@
1515

1616
namespace mlir::iree_compiler {
1717

18+
llvm::cl::opt<bool> clEnableI1Support(
19+
"iree-experimental-packed-i1-storage",
20+
llvm::cl::desc(
21+
"Experimental feature: enable i1 data type support in codegen"),
22+
llvm::cl::init(false));
23+
1824
bool needToPackSubByteElementBitWidth(unsigned bitWidth) {
25+
// Enable i1 support if requested.
26+
if (clEnableI1Support && bitWidth == 1) {
27+
return true;
28+
}
1929
// Require the original bit width to be some power of two for now to avoid
2030
// trickiness and weirdness of packing and cross-byte access.
2131
// Also disallow boolean values for now--they may require separate interface
@@ -114,15 +124,14 @@ Value calculateStorageElementCountInBytes(Location loc,
114124
if (needToPackSubByteElementBitWidth(elementBits)) {
115125
assert(8 % elementBits == 0);
116126
unsigned byteElements = 8 / elementBits;
117-
// Perform some basic sanity check to make sure the total count is byte
118-
// aligned for fully static shapes.
119-
if (paddedDynamicDims.empty() && (staticCount * elementBits) % 8 != 0) {
120-
return nullptr;
121-
}
122-
auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
123127
// TODO(antiagainst): We may want to emit runtime check to make sure this is
124128
// divisible.
125-
value = builder.createOrFold<arith::DivUIOp>(loc, value, divisor);
129+
auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
130+
if (!clEnableI1Support && paddedDynamicDims.empty() &&
131+
(staticCount * elementBits) % 8 != 0) {
132+
return nullptr;
133+
}
134+
value = builder.createOrFold<arith::CeilDivUIOp>(loc, value, divisor);
126135
}
127136

128137
return value;

tests/e2e/subbyte_types/BUILD.bazel

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright 2024 The IREE Authors
2+
#
3+
# Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
# Tests of end-to-end IREE support for sub-byte element types (currently packed i1 storage).
8+
# Each test file should cover a single sub-byte feature and test only that
9+
# functionality (though may make use of other ops where necessary). Tests should be
10+
# written using the IREE Check framework.
11+
# See https://iree.dev/developers/general/testing-guide/#iree-core-end-to-end-e2e-tests.
12+
13+
load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
14+
load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")
15+
16+
package(
17+
features = ["layering_check"],
18+
licenses = ["notice"], # Apache 2.0
19+
)
20+
21+
LLVM_SRCS = enforce_glob(
22+
# keep sorted
23+
[
24+
"subbyte_types.mlir",
25+
],
26+
include = ["*.mlir"],
27+
exclude = [],
28+
)
29+
30+
iree_check_single_backend_test_suite(
31+
name = "check_llvm-cpu_subbyte_emulation",
32+
srcs = LLVM_SRCS,
33+
compiler_flags = [
34+
"--iree-llvmcpu-target-cpu=generic",
35+
"--iree-experimental-packed-i1-storage",
36+
],
37+
driver = "local-task",
38+
tags = [
39+
# subbyte support for wasm is not on priorities.
40+
"nowasm",
41+
],
42+
target_backend = "llvm-cpu",
43+
)
44+
45+
test_suite(
46+
name = "check",
47+
tests = [
48+
":check_llvm-cpu_subbyte_emulation",
49+
],
50+
)
+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
################################################################################
2+
# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
3+
# tests/e2e/subbyte_types/BUILD.bazel #
4+
# #
5+
# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
6+
# CMake-only content. #
7+
# #
8+
# To disable autogeneration for this file entirely, delete this header. #
9+
################################################################################
10+
11+
iree_add_all_subdirs()
12+
13+
iree_check_single_backend_test_suite(
14+
NAME
15+
check_llvm-cpu_subbyte_emulation
16+
SRCS
17+
"subbyte_types.mlir"
18+
TARGET_BACKEND
19+
"llvm-cpu"
20+
DRIVER
21+
"local-task"
22+
COMPILER_FLAGS
23+
"--iree-llvmcpu-target-cpu=generic"
24+
"--iree-experimental-packed-i1-storage"
25+
LABELS
26+
"nowasm"
27+
)
28+
29+
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
func.func @i1_type() {
2+
%c0 = arith.constant 0 : index
3+
%c255 = arith.constant 255 : i8
4+
%input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8> // b01010101
5+
%input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // b10101010
6+
%lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1>
7+
%rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1>
8+
%empty = tensor.empty() : tensor<8xi1>
9+
%res = linalg.generic
10+
{indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
11+
ins(%lhs, %rhs : tensor<8xi1>, tensor<8xi1>) outs(%empty: tensor<8xi1>) {
12+
^bb0(%inlhs: i1, %inrhs: i1, %out: i1):
13+
%inres = arith.xori %inlhs, %inrhs: i1
14+
linalg.yield %inres : i1
15+
} -> tensor<8xi1>
16+
%tensor_res = flow.tensor.bitcast %res : tensor<8xi1> -> tensor<1xi8>
17+
check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
18+
return
19+
}
20+
21+
func.func @i1_type_slice() {
22+
%input = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8>
23+
%flat_input_all = flow.tensor.bitcast %input : tensor<3xi8> -> tensor<24xi1>
24+
%slice = tensor.extract_slice %flat_input_all[8][8][1] : tensor<24xi1> to tensor<8xi1>
25+
%tensor_res = flow.tensor.bitcast %slice : tensor<8xi1> -> tensor<1xi8>
26+
check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
27+
return
28+
}

0 commit comments

Comments
 (0)