From e7ed676c18814389cbae0a30518439b633d08ab1 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Sun, 30 Jun 2024 21:04:45 +0900 Subject: [PATCH] [Dev] Potentially improve performance through block reduction (#63) * improve e4m3 decoding. * append fp16xint1 * Update submodule commit reference * chore: Update shared memory scope for float32 output dtype * BUGFIX: UINT8/INT8 Decoding * feat: Add rasterization options for roller module * Refactor tensorcore_legalization method to optimize tensor core usage * feat: Add function to collect variables from expression, improve for splitk * chore: Update typing import in __init__.py * chore: Refactor CPU execution of operators * Refactor matmul implementation for splitk layout * Refactor matmul implementation for splitk layout * Refactor matmul implementation for splitk layout * chore: Update version to 0.0.1.dev8 * chore: Enable debug output in bitblas.set_debug_level() * Refactor Linear module matmul implementation for splitk layout * Refactor matmul implementation for splitk layout * Refactor CUDA kernel launch string for dynamic symbolic set * Bumpt version to v0.0.1.dev9 * Refactor CUDA kernel launch string for dynamic symbolic set * Bump version to v0.0.1.dev10 * Refactor CUDA kernel launch string for dynamic symbolic set * Bump version to v0.0.1.dev12 and add MatmulConfigWithSplitK and MatmulWithSplitK * fix the typo * Refactor CUDA kernel launch string for dynamic symbolic set * Refactor CUDA kernel launch string for dynamic symbolic set * Refactor CUDA kernel launch string for dynamic symbolic set * Refactor CUDA kernel launch string for dynamic symbolic set --------- Co-authored-by: LeiWang199 --- 3rdparty/tvm | 2 +- docs/QuickStart.md | 6 +- python/bitblas/__init__.py | 2 + python/bitblas/base/roller/arch/cuda.py | 6 +- python/bitblas/base/roller/hint.py | 12 +- .../bitblas/base/roller/policy/tensorcore.py | 15 + python/bitblas/base/utils.py | 5 +- python/bitblas/gpu/gemv.py | 2 +- python/bitblas/gpu/gemv_dequantize.py | 1 - python/bitblas/gpu/matmul_analysis.py | 10 + python/bitblas/gpu/matmul_mma.py | 376 +++++++++- python/bitblas/gpu/matmul_mma_dequantize.py | 690 +++++++++++++++++- python/bitblas/quantization/quantization.py | 5 +- python/bitblas/wrapper/general.py | 8 +- .../test_general_matmul_splitk_ops.py | 11 +- 15 files changed, 1102 insertions(+), 49 deletions(-) diff --git a/3rdparty/tvm b/3rdparty/tvm index 618306ce3..a077796b9 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 618306ce3baa2c606d43856afbe6655e4e67b2c8 +Subproject commit a077796b9e2dd3b2275fbaa212786645758c360d diff --git a/docs/QuickStart.md b/docs/QuickStart.md index 5a57edbb2..10f9948d3 100644 --- a/docs/QuickStart.md +++ b/docs/QuickStart.md @@ -14,7 +14,7 @@ import torch # enabling debug output -bitblas.set_debug_level("Debug") +bitblas.set_log_level("Debug") matmul_config = bitblas.MatmulConfig( M=1, # M dimension N=1024, # N dimension @@ -129,7 +129,7 @@ import bitblas import torch # enabling debug output -bitblas.set_debug_level("Debug") +bitblas.set_log_level("Debug") model = bitblas.Linear( in_features=1024, @@ -185,7 +185,7 @@ from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import ( ) # enabling debug output -bitblas.set_debug_level("Debug") +bitblas.set_log_level("Debug") in_features = 1024 out_features = 1024 diff --git a/python/bitblas/__init__.py b/python/bitblas/__init__.py index 6a18c4703..14b510845 100644 --- a/python/bitblas/__init__.py +++ b/python/bitblas/__init__.py 
@@ -40,6 +40,7 @@ import logging from tqdm import tqdm + class TqdmLoggingHandler(logging.Handler): """ Custom logging handler that directs log output to tqdm progress bar to avoid interference. """ @@ -61,6 +62,7 @@ def set_log_level(level): Args: level (str or int): Can be the string name of the level (e.g., 'INFO') or the actual level (e.g., logging.INFO). + OPTIONS: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL' """ if isinstance(level, str): level = getattr(logging, level.upper(), logging.INFO) diff --git a/python/bitblas/base/roller/arch/cuda.py b/python/bitblas/base/roller/arch/cuda.py index 63775ecbe..2189947e7 100644 --- a/python/bitblas/base/roller/arch/cuda.py +++ b/python/bitblas/base/roller/arch/cuda.py @@ -4,7 +4,7 @@ import tvm from tvm.target import Target from .arch_base import TileDevice -from typing import List, Dict +from typing import List, Dict, Union def check_sm_version(arch: str) -> int: @@ -28,7 +28,9 @@ def __init__( class CUDA(TileDevice): - def __init__(self, target: Target): + def __init__(self, target: Union[Target, str]): + if isinstance(target, str): + target = tvm.target.Target(target) self.target = target self.sm_version = check_sm_version(self.target.arch) device = tvm.runtime.cuda(0) diff --git a/python/bitblas/base/roller/hint.py b/python/bitblas/base/roller/hint.py index 89f607cde..f6e2fb03a 100644 --- a/python/bitblas/base/roller/hint.py +++ b/python/bitblas/base/roller/hint.py @@ -154,18 +154,20 @@ def __init__(self) -> None: self.arch = None self.use_tc = None # todo(lei): this should be renamed. - # special axes tiling info + # Special axes tiling info self.block = [] self.thread = [] - # special axes for tensorCore + # Special axes for MMA self.warp = [] - # reduce axes tiling info + # Reduce axes tiling info self.rstep = [] self.reduce_thread = [] self.rasterization_plan = NoRasterization() self.cached_tensors = [] self.output_strides = {} self.schedule_stages = None + # Config for block reduction + self.block_reduction_depth = None # type: int # Experimental self._raxis_order = [] @@ -203,6 +205,10 @@ def to_dict(self) -> Dict: dic["raxis_order"] = self._raxis_order if self.vectorize != {}: dic["vectorize"] = self.vectorize + if self.pipeline_stage != 1: + dic["pipeline_stage"] = self.pipeline_stage + if self.block_reduction_depth is not None: + dic["block_reduction_depth"] = self.block_reduction_depth return dic def from_dict(self, dic: Dict) -> "Hint": diff --git a/python/bitblas/base/roller/policy/tensorcore.py b/python/bitblas/base/roller/policy/tensorcore.py index 97edb50fc..f4047ef08 100644 --- a/python/bitblas/base/roller/policy/tensorcore.py +++ b/python/bitblas/base/roller/policy/tensorcore.py @@ -25,6 +25,7 @@ def __init__(self, self.wmma_k = 16 self.pipeline_stage: int = 1 self.use_async_copy: bool = False + self.block_reduction_depth: Optional[int] = None self._legalize_info() def _legalize_info(self): @@ -44,6 +45,11 @@ def _legalize_info(self): self.use_async_copy = True else: self.use_async_copy = False + # TODO: block reduction depth is not used for now. + # As there still exists some performance issues for block reduction. 
+ # block_reduction_depth = self.prim_func_node.get_tag("block_reduction_depth") + # if block_reduction_depth: + # self.block_reduction_depth = block_reduction_depth def _compute_tc_strides( self, @@ -114,6 +120,7 @@ def _check_small_tile(td: TileDict): smem_limit = min(self.arch.max_smem_usage // td.block_per_SM, self.arch.smem_cap) rstep_map = td.rstep_map.copy() + is_block_reduction = self.block_reduction_depth is not None def _optimize(node, rstep): all_steps = self.get_node_reduce_step_candidates(node) @@ -177,6 +184,13 @@ def _enlarge(rstep_id): if len(node.raxis) > 0: rstep = _optimize(node, rstep_map) rstep_map = rstep + + if is_block_reduction: + # If block reduction, we should constrain the max value is 64 + # Otherwise it will introduce an issue of cuda invalid args. + MAX_REDUCE_K = 64 + for k in rstep_map: + rstep_map[k] = min(rstep_map[k], MAX_REDUCE_K) td.rstep_map = rstep_map td.smem_cost, td.cached_tensors_map = self._compute_shared_memory_usage(td) return @@ -289,6 +303,7 @@ def _score(node, thread): # small is better codegen_dict.warp = warp_tile codegen_dict.use_tc = True codegen_dict.pipeline_stage = self.pipeline_stage + codegen_dict.block_reduction_depth = self.block_reduction_depth codegen_dict.use_async = self.use_async_copy codegen_dict.rstep = [int(rsteps[ax.var.name]) for ax in node.raxis] codegen_dict.cached_tensors = td.cached_tensors_map[node] diff --git a/python/bitblas/base/utils.py b/python/bitblas/base/utils.py index 7da309dd5..50adc135f 100644 --- a/python/bitblas/base/utils.py +++ b/python/bitblas/base/utils.py @@ -168,6 +168,7 @@ def apply_and_build_parallel(func, arch, num_repeats=3, max_workers=10, + timeout=30, data_distribution="uniform") -> CompileResult: cpresults = [] @@ -187,10 +188,10 @@ def _apply_schedule(f, c): with ThreadPoolExecutor(max_workers=4) as scheduler: futures = {scheduler.submit(_apply_schedule, func, config) for config in configs} - for future in as_completed(futures): + for future in as_completed(futures, timeout=timeout): _sched.append(future.result()) - builder = PopenPoolExecutor(max_workers=max_workers) + builder = PopenPoolExecutor(max_workers=max_workers, timeout=timeout) # build in process parallel def _build(context) -> str: diff --git a/python/bitblas/gpu/gemv.py b/python/bitblas/gpu/gemv.py index 7b08179d3..60a290a81 100644 --- a/python/bitblas/gpu/gemv.py +++ b/python/bitblas/gpu/gemv.py @@ -21,7 +21,7 @@ # Modifications Copyright (c) Microsoft. # The code below is mostly copied from apache/tvm gemv.py in dlight. 
"""A rule for GEMV and DecodeGEMV.""" -import re + from functools import reduce from typing import List, Optional, Union, Dict diff --git a/python/bitblas/gpu/gemv_dequantize.py b/python/bitblas/gpu/gemv_dequantize.py index 5a6405f52..5ccc5b40e 100644 --- a/python/bitblas/gpu/gemv_dequantize.py +++ b/python/bitblas/gpu/gemv_dequantize.py @@ -287,7 +287,6 @@ def get_vectorize_factor(target_format): assert len(config.thread) == 2, "SplitK only support 2D thread config" num_warps = int(num_warps // config.thread[0]) - # get target dequantize buffer's idx def get_idx(weight_decode_info: Dict): # for LUT dequantize, the expr is LUT(w), the idx is 1 diff --git a/python/bitblas/gpu/matmul_analysis.py b/python/bitblas/gpu/matmul_analysis.py index 4638f2e72..6537a555a 100644 --- a/python/bitblas/gpu/matmul_analysis.py +++ b/python/bitblas/gpu/matmul_analysis.py @@ -619,6 +619,16 @@ def check_last_trait(region: List[Range]): if func.attrs is not None and "weight_transform_kind" in func.attrs: intrin_info["weight_transform_kind"] = func.attrs["weight_transform_kind"] tags["intrin_info"] = intrin_info + # Analysis Block Reduction Optimization + # Currently, we only support block reduction depth 2 for small M + # When the func is a dequantize like ops, we should consider the M + if hasattr(func.attrs, "dequantize_info"): + for arg in func.params: + inp_shape = func.buffer_map[arg].shape + M = inp_shape[0] + if isinstance(M, tir.IntImm) and M <= 128: + tags["block_reduction_depth"] = 2 + break return tags diff --git a/python/bitblas/gpu/matmul_mma.py b/python/bitblas/gpu/matmul_mma.py index a20359e11..4bf8be4e6 100644 --- a/python/bitblas/gpu/matmul_mma.py +++ b/python/bitblas/gpu/matmul_mma.py @@ -8,6 +8,7 @@ from tvm import tir, DataType from tvm.target import Target +from ..base.roller import Hint from ..base.roller.rasterization import NoRasterization from ..base import analysis from .base import GPUScheduleRule @@ -338,12 +339,15 @@ def store_output(block_outer, write_buffer_idx): def apply_config( # pylint: disable=too-many-locals,missing-docstring self, func: tir.PrimFunc, - config, + config: Hint, ) -> Optional[tir.Schedule]: if "dequantize_info" in func.attrs: dequantize_rule = MatmulTensorizationMMAWithDequantizeInfo() return dequantize_rule.apply_config(func, config) + if hasattr(config, "block_reduction_depth") and config.block_reduction_depth is not None: + return self.apply_block_reduction_with_config(func, config) + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel get_mma_intrin_group,) @@ -686,7 +690,375 @@ def tensorize_init_store_compute(): sch.annotate( sch.get_loops(block_init_c)[-2], ann_key="inject_customized_code_prepend", - ann_val=invoke_func) + ann_val=invoke_func, + ) + # plan import source + if len(import_source) > 0: + sch.annotate( + thread_idz, + ann_key="pragma_import_c", + ann_val=("\n").join(import_source), + ) + return sch + + def apply_block_reduction_with_config( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + config: Hint, + ) -> Optional[tir.Schedule]: + if "dequantize_info" in func.attrs: + dequantize_rule = MatmulTensorizationMMAWithDequantizeInfo() + return dequantize_rule.sch_shared_memory_prefetch_block_reduction_with_config( + func, config) + + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_mma_intrin_group,) + + import_source: List[str] = [] + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = 
sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + + output_blocks = [sch.get(block) for block in sch.get_output_blocks(root_block)] + + def check_require_cache(func: tir.PrimFunc, config): + conditions: List[bool] = [] + + # check if has dynamic symbolic + def check_has_dynamic(func: tir.PrimFunc): + for param in func.params: + if param not in func.buffer_map: + continue + arg = func.buffer_map[param] + for i in arg.shape: + if isinstance(i, tir.Var): + return True + return False + + conditions.append(check_has_dynamic(func)) + # check if has post process + conditions.append(sch.get(main_block) not in output_blocks) + # check if not use async copy + conditions.append(config.use_async is False) + return any(conditions) + + cache_write_required = check_require_cache(func, config=config) + + # Step 1. Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K]/B[S, K, J] + if not (func.attrs is not None and "dlight.tensorcore_prenormlized" in func.attrs.keys()): + sch = normalize_to_matmul(sch, main_block, ["a", "a", "a"]) + + shared_scope = config.shared_scope + + intrin_info = config.intrin_info + intrin_group = get_mma_intrin_group( + load_scope=shared_scope, + store_scope=shared_scope if cache_write_required else "global", + a_dtype=intrin_info.in_dtype, + b_dtype=intrin_info.in_dtype, + out_dtype=intrin_info.out_dtype, + trans_a=intrin_info.trans_a, + trans_b=intrin_info.trans_b, + smooth_a=intrin_info.smooth_a, + smooth_b=intrin_info.smooth_b, + not_use_mma_store_intrinic=False, + ) + + # Start Schedule + # Step 0. Get schedule config. + warp_row_tiles = config.warp[0] + warp_col_tiles = config.warp[1] + block_row_warps = config.block[0] // warp_row_tiles + block_col_warps = config.block[1] // warp_col_tiles + stage = config.pipeline_stage + use_async = config.use_async + assert (config.block_reduction_depth is not None), "block_reduction_depth is required" + reduce_k = config.block_reduction_depth + chunk = config.rstep[0] // reduce_k + + # tensor core intrinsic size + micro_size_x, micro_size_y, micro_size_k = intrin_group["micro_kernel"] + + # get the axis for layout transform + def get_axis(l, r, trans): # noqa: E741 + return (r, l) if trans else (l, r) # noqa: E741 + + a_lr = get_axis(micro_size_x, micro_size_k, intrin_info.trans_a) + b_lr = get_axis(micro_size_k, micro_size_y, intrin_info.trans_b) + + def can_enable_swizzle(dtype: str, smooth: bool): + # inject_permuted_layout only support float16 currently + if dtype == "float16" or dtype == "int8": + # introduce the constraint of reduce_k because reduce_k will doubling the size of + if (chunk * reduce_k) * DataType(dtype).bits != (512): + # currently the swizzle rule only support 512 bit. 
+ return False + # if we use smooth layout, we don't need to do swizzling + return not smooth + return False + + can_swizzle_a = can_enable_swizzle(intrin_info.in_dtype, intrin_info.inter_transform_a) + can_swizzle_b = can_enable_swizzle(intrin_info.in_dtype, intrin_info.inter_transform_b) + + warp_size = 32 + + i_factors, j_factors, k_factors = ( + [None, 1, block_row_warps, warp_row_tiles // micro_size_x], + [1, None, block_col_warps, warp_col_tiles // micro_size_y], + [None, (reduce_k * chunk) // micro_size_k], + ) + + num_ty = i_factors[2] + num_tz = j_factors[2] + x_pad_factor = i_factors[2] * i_factors[3] + y_pad_factor = j_factors[2] * j_factors[3] + k_pad_factor = k_factors[1] + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + micro_size_x * x_pad_factor, + micro_size_y * y_pad_factor, + micro_size_k * k_pad_factor, + ], + ) + + # Step 3. Schedule matmul to use tensor core + block = main_block + + batch, i, j, k = sch.get_loops(block) + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_x]) + j, j_inner = sch.split(j, factors=[None, micro_size_y]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = block + block_outer = sch.blockize(i_inner) + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, k_factors) + k0, kr = sch.split(k0, [None, reduce_k]) + + sch.reorder(i0, j0, i1, j1, i2, j2, kr, i3, j3, k0, k1) + + block_idy = sch.fuse(i0, j0) + block_idx = sch.fuse(i1, j1) + thread_idy = i2 + thread_idz = j2 + + sch.bind(batch, "blockIdx.z") + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + thread_idz = j2 = thread_idy = sch.fuse(thread_idy, thread_idz) + sch.bind(thread_idy, "threadIdx.y") + sch.bind(kr, "threadIdx.z") + + # rewrite smooth layout of shared memory + def smooth_smem_layout_rewrite(block, scope, l=16, r=16, enable=True): # noqa: E741 + if not enable: + return + sch.transform_layout( + block, + scope, + lambda b, i, j: ( + b, + i // l, + j // r, + i % l, + j % r, + ), + ) + + smooth_smem_layout_rewrite( + block_outer, ("read", 0), *a_lr, enable=intrin_info.inter_transform_a) + smooth_smem_layout_rewrite( + block_outer, ("read", 1), *b_lr, enable=intrin_info.inter_transform_b) + smooth_smem_layout_rewrite(block_outer, ("write", 0), enable=True) + + def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False, trans=False): + block_read = sch.cache_read(block, idx, shared_scope) + sch.compute_at(block_read, k0, preserve_unit_loops=True) + ndim = len(sch.get(block_read).iter_vars) + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + + f_r, f_0, f_1, f_2, f_3, f_4 = sch.split( + fused, factors=[reduce_k, num_ty, num_tz, None, warp_size, vec_len]) + + sch.bind(f_3, "threadIdx.x") + f_0 = f_1 = sch.fuse(f_0, f_1) + sch.bind(f_0, "threadIdx.y") + sch.bind(f_r, "threadIdx.z") + sch.vectorize(f_4) + sch.unroll(f_2) + # Apply Swizzling + sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle) + # if not, apply padding to alleviate bank conflict + if not (can_swizzle or is_smooth): + pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 + sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset) + sch.annotate(f_2, "pragma_unroll_explicit", False) + return block_read + + if len(config.vectorize.values()) < 2: + return None + + a_g2s = fetch_to_shared( + block_outer, + 0, 
+ vec_len=list(config.vectorize.values())[0], + can_swizzle=can_swizzle_a, + is_smooth=intrin_info.smooth_a, + trans=intrin_info.trans_a, + ) + b_g2s = fetch_to_shared( + block_outer, + 1, + vec_len=list(config.vectorize.values())[1], + can_swizzle=can_swizzle_b, + is_smooth=intrin_info.smooth_b, + trans=intrin_info.trans_b, + ) + + # rewrite global smooth layout + def smooth_gmem_layout_rewrite(sch, block, enable=True, trans=False, matrix_name="A"): + if not enable: + return + # step1: find the first producer block + # Notes: we assume the layout propagate happens in the first producer block + # otherwise, the layout transform will have no effect as it will transform both + # read and write buffer + producers = _collect_producers(sch, block) + g2s_block = a_g2s if matrix_name == "A" else b_g2s + propagate_block: tir.Block = (producers[-1] if len(producers) > 0 else g2s_block) + + # step2: transform the layout with inverse permutation + intra_indexmap, _ = get_propagate_map( + trans=trans, dtype=intrin_info.in_dtype, matrix_name=matrix_name) + + def inverse_permutation(i, j, ii, jj): + return (i, j, *intra_indexmap.map_indices([ii, jj])) + + sch.transform_layout(propagate_block, ("read", 0), inverse_permutation) + + smooth_gmem_layout_rewrite( + sch, a_g2s, intrin_info.smooth_a, intrin_info.trans_a, matrix_name="A") + smooth_gmem_layout_rewrite( + sch, b_g2s, intrin_info.smooth_b, intrin_info.trans_b, matrix_name="B") + auto_inline_producers(sch, a_g2s) + auto_inline_producers(sch, b_g2s) + + # create read cache to load matrix from shared memory to wmma fragments + A_mat = sch.cache_read(block_outer, 0, "warp") + B_mat = sch.cache_read(block_outer, 1, "warp") + sch.compute_at(A_mat, k1) + sch.compute_at(B_mat, k1) + + # create write cache to store matrix from wmma fragments to shared memory and global memory + if cache_write_required: + accumulator_shared_to_global = sch.cache_write(block_outer, 0, shared_scope) + + store = sch.cache_write(block_outer, 0, "warp") + sch.reverse_compute_at(store, j2) + + # split the store loop to match hardware intrinsic pattern + i, j = sch.get_loops(store)[-2:] + i0, i1 = sch.split(i, factors=[None, micro_size_x], preserve_unit_iters=False) + j0, j1 = sch.split(j, factors=[None, micro_size_y], preserve_unit_iters=False) + sch.reorder(i0, j0, i1, j1) + + if cache_write_required: + auto_inline_consumer_chain(sch, accumulator_shared_to_global) + sch.reverse_compute_at( + accumulator_shared_to_global, + sch.get_loops(store)[-5], + preserve_unit_loops=True, + ) + vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global)) + fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-5:]) + f0, f1, f2 = sch.split(fused, factors=[None, warp_size, vec_len]) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + sch.unroll(f0) + sch.annotate(f0, "pragma_unroll_explicit", False) + else: + auto_inline_consumer_chain(sch, store) + + block_init_c = sch.decompose_reduction(block_outer, k0) + block_init_c_inner = sch.get_child_blocks(block_init_c)[0] + + # Tensorization by hardware intrinsics + index_map_a, index_map_b, index_map_c = intrin_group["index_map"] + + sch.transform_layout( + A_mat, + ("write", 0), + get_warp_index_map(index_map_a, *a_lr, intrin_info.inter_transform_a), + ) + sch.transform_layout( + B_mat, + ("write", 0), + get_warp_index_map(index_map_b, *b_lr, intrin_info.inter_transform_b), + ) + sch.transform_layout( + store, + ("read", 0), + get_warp_index_map(index_map_c, is_5d=True), + ) + + i, j = sch.get_loops(A_mat)[-2:] + i0, i1 = 
sch.split(i, factors=[None, a_lr[0]]) + j0, j1 = sch.split(j, factors=[None, a_lr[1]]) + sch.reorder(i0, j0, i1, j1) + ba = sch.blockize(i1) + # sch.annotate(ba, ann_key="permuted_layout", ann_val=can_swizzle_a) + sch.tensorize(ba, intrin_group["load_a"]) + + i, j = sch.get_loops(B_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, b_lr[0]]) + j0, j1 = sch.split(j, factors=[None, b_lr[1]]) + sch.reorder(i0, j0, i1, j1) + bb = sch.blockize(i1) + # sch.annotate(bb, ann_key="permuted_layout", ann_val=can_swizzle_b) + sch.tensorize(bb, intrin_group["load_b"]) + + def tensorize_init_store_compute(): + sch.tensorize(sch.get_loops(block_init_c_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(store)[-2], intrin_group["store"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + + tensorize_init_store_compute() + + if stage > 1: + sch.annotate(k0, ann_key="software_pipeline_stage", ann_val=[0, 0, stage - 1]) + sch.annotate(k0, ann_key="software_pipeline_order", ann_val=[0, 1, 2]) + if use_async: + sch.annotate(k0, "software_pipeline_async_stages", [0]) + + # plan rasteration + if not isinstance(config.rasterization_plan, NoRasterization): + device_func, invoke_func = config.rasterization_plan.get_code() + import_source.append(device_func) + sch.annotate( + sch.get_loops(block_init_c)[-2], + ann_key="inject_customized_code_prepend", + ann_val=invoke_func, + ) # plan import source if len(import_source) > 0: sch.annotate( diff --git a/python/bitblas/gpu/matmul_mma_dequantize.py b/python/bitblas/gpu/matmul_mma_dequantize.py index 96461db45..679e84395 100644 --- a/python/bitblas/gpu/matmul_mma_dequantize.py +++ b/python/bitblas/gpu/matmul_mma_dequantize.py @@ -27,6 +27,94 @@ ) +def _bind_thread_based_on_config(sch, block, block_row_warps, block_col_warps, warp_size): + # assume the block loops has been fused + last_loop = sch.get_loops(block)[-1] + loop_extent = sch.get(last_loop).extent + vec = get_coalesced_veclen(sch.get(block)) + + if loop_extent // (vec * warp_size) == 0: + last_loop, B_shared_tx, B_shared_vi = sch.split(last_loop, factors=[1, warp_size, None]) + sch.bind(B_shared_tx, "threadIdx.x") + sch.vectorize(B_shared_vi) + loop_extent = sch.get(last_loop).extent + + # warp_size - 1 handling for 32x7 alike case, which may cause unaligned threadIdx.x mapping. 
+ if loop_extent // vec >= 1 and loop_extent & (vec - 1) == 0 and (loop_extent // + vec) & (warp_size - 1) == 0: + last_loop, B_shared_vi = sch.split(last_loop, factors=[None, vec]) + sch.vectorize(B_shared_vi) + loop_extent = sch.get(last_loop).extent + + if loop_extent // warp_size >= 1 and loop_extent % warp_size == 0: + last_loop, B_shared_tx = sch.split(last_loop, factors=[None, warp_size]) + sch.bind(B_shared_tx, "threadIdx.x") + loop_extent = sch.get(last_loop).extent + + if loop_extent // block_row_warps >= 1 and loop_extent % block_row_warps == 0: + last_loop, B_shared_ty = sch.split(last_loop, factors=[None, block_row_warps]) + sch.bind(B_shared_ty, "threadIdx.y") + loop_extent = sch.get(last_loop).extent + + if loop_extent // block_col_warps >= 1 and loop_extent % block_col_warps == 0: + last_loop, B_shared_tz = sch.split(last_loop, factors=[None, block_col_warps]) + sch.bind(B_shared_tz, "threadIdx.z") + loop_extent = sch.get(last_loop).extent + + sch.unroll(last_loop) + sch.annotate(last_loop, "pragma_unroll_explicit", False) + + +def _bind_thread_based_with_block_reduce_on_config(sch, block, block_row_warps, block_col_warps, + warp_size, reduce_k): + # assume the block loops has been fused + last_loop = sch.get_loops(block)[-1] + loop_extent = sch.get(last_loop).extent + vec = get_coalesced_veclen(sch.get(block)) + + if loop_extent // (vec * warp_size) == 0: + last_loop, B_shared_tx, B_shared_vi = sch.split(last_loop, factors=[1, warp_size, None]) + sch.bind(B_shared_tx, "threadIdx.x") + sch.vectorize(B_shared_vi) + loop_extent = sch.get(last_loop).extent + + # warp_size - 1 handling for 32x7 alike case, which may cause unaligned threadIdx.x mapping. + if loop_extent // vec >= 1 and loop_extent & (vec - 1) == 0 and (loop_extent // + vec) & (warp_size - 1) == 0: + last_loop, B_shared_vi = sch.split(last_loop, factors=[None, vec]) + sch.vectorize(B_shared_vi) + loop_extent = sch.get(last_loop).extent + + if loop_extent // warp_size >= 1 and loop_extent % warp_size == 0: + last_loop, B_shared_tx = sch.split(last_loop, factors=[None, warp_size]) + sch.bind(B_shared_tx, "threadIdx.x") + loop_extent = sch.get(last_loop).extent + + B_shared_ty = None + if loop_extent // block_row_warps >= 1 and loop_extent % block_row_warps == 0: + last_loop, B_shared_ty = sch.split(last_loop, factors=[None, block_row_warps]) + loop_extent = sch.get(last_loop).extent + + B_shared_tz = None + if loop_extent // block_col_warps >= 1 and loop_extent % block_col_warps == 0: + last_loop, B_shared_tz = sch.split(last_loop, factors=[None, block_col_warps]) + loop_extent = sch.get(last_loop).extent + + if B_shared_ty and B_shared_tz: + B_shared_ty = sch.fuse(B_shared_tz, B_shared_ty) + sch.bind(B_shared_ty, "threadIdx.y") + elif B_shared_ty: + sch.bind(B_shared_ty, "threadIdx.y") + + if loop_extent // reduce_k >= 1 and loop_extent % reduce_k == 0: + last_loop, B_shared_tk = sch.split(last_loop, factors=[None, reduce_k]) + sch.bind(B_shared_tk, "threadIdx.z") + loop_extent = sch.get(last_loop).extent + + sch.unroll(last_loop) + sch.annotate(last_loop, "pragma_unroll_explicit", False) + + def get_index_map_3d(index_map, l=16, r=16): # noqa: E741 def index_map_3d(b, i, j): @@ -359,7 +447,7 @@ def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False): if not (can_swizzle or is_smooth): pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset) - sch.annotate(f_2, "pragma_unroll_explicit", False) + sch.annotate(f_0, 
"pragma_unroll_explicit", False) return block_read a_g2s = fetch_to_shared( @@ -579,7 +667,7 @@ def tensorize_init_store_compute(): def sch_dequantize_in_register_with_config( self, func: tir.PrimFunc, - config, + config: Hint, ): """ For devices without async copy, we can use a simple dequantize schedule without shared memory prefetch. @@ -841,13 +929,14 @@ def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False): sch.bind(f_1, "threadIdx.y") sch.vectorize(f_4) sch.unroll(f_0) + sch.annotate(f_0, "pragma_unroll_explicit", False) + # Apply Swizzling sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle) # if not, apply padding to alleviate bank conflict if not (can_swizzle or is_smooth): pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset) - sch.annotate(f_2, "pragma_unroll_explicit", False) return block_read a_g2s = fetch_to_shared( @@ -1067,7 +1156,7 @@ def tensorize_init_store_compute(): def sch_shared_memory_prefetch_with_config( self, func: tir.PrimFunc, - config, + config: Hint, ): """ For A100 Like devices, the shared memory prefetch(async) is required @@ -1083,6 +1172,9 @@ def sch_shared_memory_prefetch_with_config( V compute """ + if hasattr(config, "block_reduction_depth") and config.block_reduction_depth is not None: + return self.sch_shared_memory_prefetch_block_reduction_with_config(func, config) + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel get_mma_intrin_group,) from .intrin import get_lop3_intrin_group @@ -1136,8 +1228,6 @@ def check_weight_decode_info(weight_decode_info): # Start Schedule # Step 0. Get schedule config. - # NOTE: we can analyze the config by the hardware spec in the future - # tensor core intrinsic size shared_scope = config.shared_scope @@ -1387,13 +1477,14 @@ def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False): sch.bind(f_1, "threadIdx.y") sch.vectorize(f_4) sch.unroll(f_0) + sch.annotate(f_0, "pragma_unroll_explicit", False) + # Apply Swizzling sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle) # if not, apply padding to alleviate bank conflict if not (can_swizzle or is_smooth): pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset) - sch.annotate(f_2, "pragma_unroll_explicit", False) return block_read a_g2s = fetch_to_shared( @@ -1496,25 +1587,10 @@ def get_idx(): sch.compute_at(block_shared_local_local_shared, k0, preserve_unit_loops=True) ndim = len(sch.get(block_shared_local_local_shared).iter_vars) - fused = sch.fuse(*sch.get_loops(block_shared_local_local_shared)[-ndim:]) - - f_0, f_1, f_2, f_3, f_4 = sch.split( - fused, - factors=[ - None, - num_tz, - num_ty, - warp_size, - get_coalesced_veclen(sch.get(block_shared_local_local_shared)), - ], - ) + _ = sch.fuse(*sch.get_loops(block_shared_local_local_shared)[-ndim:]) - sch.bind(f_3, "threadIdx.x") - sch.bind(f_2, "threadIdx.y") - sch.bind(f_1, "threadIdx.z") - sch.vectorize(f_4) - sch.unroll(f_0) - sch.annotate(f_0, "pragma_unroll_explicit", False) + _bind_thread_based_on_config(sch, block_shared_local_local_shared, num_ty, num_tz, + warp_size) # cache small tensors, e.g. 
LUT if b_idx: @@ -1636,10 +1712,572 @@ def tensorize_init_store_compute(): ) return sch + def sch_shared_memory_prefetch_block_reduction_with_config( + self, + func: tir.PrimFunc, + config: Hint, + ): + """ + For A100 Like devices, the shared memory prefetch(async) is required + to achieve optimal performance. + quantized weight + | + V + shared memory prefetch (with async copy) + | + V + dequantized into shared memory + | + V + compute + """ + from tvm.tir.tensor_intrin.cuda import ( # pylint: disable=import-outside-toplevel + get_mma_intrin_group,) + from .intrin import get_lop3_intrin_group + + import_source: List[str] = [] + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if func.attrs is not None and "dlight.do_not_tensorize" in func.attrs.keys(): + return None + + reduction_blocks = get_reduction_blocks(sch, blocks) + if reduction_blocks is None: + return None + + main_block = reduction_blocks[0] + # always enable shared memory rewrite + cache_write_required = True + + # Check Dequantize Info + # TODO(leiwang): this is a hack to get the configuration, can be improved by writing a pass to analysis the dequantize block. + dequantize_info = func.attrs["dequantize_info"] + + def check_dequantize_info(dequantize_info): + conditions = [] + # currently only support weight only dequantization + conditions.append(len(dequantize_info) == 1) + # TODO(@lei) check if the dequantize value name is weight + return all(conditions) + + assert check_dequantize_info(dequantize_info) + + (weight_decode_info,) = list(dequantize_info.values()) + + def check_weight_decode_info(weight_decode_info): + conditions = [] + # check source format in ["int", "fp", "nf"] + conditions.append("source_format" in weight_decode_info) + conditions.append(weight_decode_info["source_format"]["format"] in + ["uint", "int", "fp", "nf", "fp_e4m3"]) + # check source bits in [1, 2, 4, 8] + conditions.append(weight_decode_info["source_format"]["bits"] in [1, 2, 4, 8]) + # check target format in ["float16", "int8"] + conditions.append("target_format" in weight_decode_info) + conditions.append(weight_decode_info["target_format"] in ["float16", "int8"]) + return all(conditions) + + assert check_weight_decode_info(weight_decode_info), "Invalid B_decode_info" + + # Start Schedule + # Step 0. Get schedule config. 
+ # tensor core intrinsic size + shared_scope = config.shared_scope + + intrin_info = config.intrin_info + intrin_group = get_mma_intrin_group( + load_scope=shared_scope, + store_scope=shared_scope if cache_write_required else "global", + a_dtype=intrin_info.in_dtype, + b_dtype=intrin_info.in_dtype, + out_dtype=intrin_info.out_dtype, + trans_a=intrin_info.trans_a, + trans_b=intrin_info.trans_b, + smooth_a=intrin_info.smooth_a, + smooth_b=intrin_info.smooth_b, + not_use_mma_store_intrinic=False, + ) + + warp_row_tiles = config.warp[0] + warp_col_tiles = config.warp[1] + block_row_warps = config.block[0] // warp_row_tiles + block_col_warps = config.block[1] // warp_col_tiles + stage = config.pipeline_stage + use_async = config.use_async + assert (config.block_reduction_depth is not None), "block_reduction_depth is required" + reduce_k = config.block_reduction_depth + chunk = config.rstep[0] // reduce_k + + micro_size_x, micro_size_y, micro_size_k = intrin_group["micro_kernel"] + + # get the axis for layout transform + def get_axis(l, r, trans): # noqa: E741 + return (r, l) if trans else (l, r) # noqa: E741 + + a_lr = get_axis(micro_size_x, micro_size_k, intrin_info.trans_a) + b_lr = get_axis(micro_size_k, micro_size_y, intrin_info.trans_b) + + def can_enable_swizzle(dtype: str, smooth: bool): + # inject_permuted_layout only support float16 currently + if dtype == "float16" or dtype == "int8": + if (chunk * reduce_k) * DataType(dtype).bits != 512: + # currently the swizzle rule only support 512 bit. + return False + # if we use smooth layout, we don't need to do swizzling + return not smooth + return False + + can_swizzle_a = can_enable_swizzle(intrin_info.in_dtype, intrin_info.inter_transform_a) + can_swizzle_b = can_enable_swizzle(intrin_info.in_dtype, intrin_info.inter_transform_b) + + # rewrite global smooth layout, for dequantize, currently only support weight only recover. + def smooth_gmem_layout_rewrite( + sch, + main_block, + enable=True, + trans=False, + matrix_name="A", + intrin_group=intrin_group, + ): + if not enable: + return + + # normalized block may have three read buffers, while the first one is the write buffer. 
+ buffer_offset = (1 if sch.get(main_block).reads[0].buffer + == sch.get(main_block).writes[0].buffer else 0) + buffer_idx = 0 if matrix_name == "A" else 1 + source_buffer = sch.get(main_block).reads[buffer_offset + buffer_idx].buffer + + # step1: find the first producer block + # Notes: we assume the layout propagate happens in the first producer block + # otherwise, the layout transform will have no effect as it will transform both + # read and write buffer + propagate_block: tir.Block = find_last_producer_from_buffer( + sch, main_block, source_buffer) + # some trick impl may not have reindex block + (weight_dequantize_info,) = dequantize_info.values() + if (sch.get(propagate_block).name_hint == weight_dequantize_info["decode_block"]): + return + # step2: transform the layout with inverse permutation + intra_indexmap, _ = get_propagate_map( + trans=trans, dtype=intrin_info.in_dtype, matrix_name=matrix_name) + + # step3: propagate the matmul layout to the first reindex block + + intra_indexmap = layout_propagate_chain( + sch, + start_block=main_block, + start_buffer=source_buffer, + end_block=propagate_block, + index_map=intra_indexmap, + ) + + def inverse_permutation(i, j, ii, jj): + return (i, j, *intra_indexmap.map_indices([ii, jj])) + + sch.transform_layout(propagate_block, ("read", 0), inverse_permutation) + + intra_index_map, _ = get_propagate_map( + trans=trans, dtype=intrin_info.in_dtype, matrix_name=matrix_name) + + # get target dequantize buffer's offset + def get_offset(): + # for LUT dequantize, the expr is LUT(w), the idx is 1 + # maybe we can use a more general and structural based way + # to analysis the idx + if weight_dequantize_info["source_format"]["format"] == "nf": + return 1 + return 0 + + offset = get_offset() + dequantize_block = sch.get_block(weight_dequantize_info["decode_block"]) + group_size = weight_dequantize_info["group_size"] + + _, mn, mk = intrin_group["micro_kernel"] + + def get_param_indices( + indexmap, + l=mn, + r=mk, + group_size=group_size # noqa: E741 + ): # noqa: E741 + # assume the param layout is n, k + rl, rr = [x.var for x in sch.get(dequantize_block).iter_vars] + warp_i, warp_j = rl % l, rr % r + spatial_i, spatial_j = rl // l, rr // r + warp_i, warp_j = indexmap.map_indices([warp_i, warp_j]) + new_indices = ( + spatial_i * l + warp_i, + (spatial_j * r + warp_j) // group_size, + ) + return new_indices + + with_scaling = bool(weight_dequantize_info["with_scaling"]) + if with_scaling: + sch.unsafe_rewrite_buffer_region( + dequantize_block, + ("read", offset + 1), + get_param_indices(intra_index_map), + ) + with_zeros = bool(weight_dequantize_info["with_zeros"]) + if with_zeros: + sch.unsafe_rewrite_buffer_region( + dequantize_block, + ("read", offset + 2), + get_param_indices(intra_index_map), + ) + + smooth_gmem_layout_rewrite( + sch, main_block, intrin_info.smooth_a, intrin_info.trans_a, matrix_name="A") + + smooth_gmem_layout_rewrite( + sch, main_block, intrin_info.smooth_b, intrin_info.trans_b, matrix_name="B") + + warp_size = 32 + + i_factors, j_factors, k_factors = ( + [None, 1, block_row_warps, warp_row_tiles // micro_size_x], + [1, None, block_col_warps, warp_col_tiles // micro_size_y], + [None, chunk // micro_size_k], + ) + + num_ty = i_factors[2] + num_tz = j_factors[2] + x_pad_factor = i_factors[2] * i_factors[3] + y_pad_factor = j_factors[2] * j_factors[3] + k_pad_factor = k_factors[1] + + # Step 1. 
Normalize generic matmul to C[S, I, J] += A[S, I, K] * B[S, J, K]/B[S, K, J] + if not (func.attrs is not None and "dlight.tensorcore_prenormlized" in func.attrs.keys()): + sch = normalize_to_matmul(sch, main_block, ["a", "a", "a"]) + + # Step 2. Padding for dynamic shape kernels + sch.pad_einsum( + main_block, + [ + 1, + micro_size_x * x_pad_factor, + micro_size_y * y_pad_factor, + micro_size_k * k_pad_factor, + ], + ) + + # Step 3. Schedule matmul to use tensor core + block = main_block + + batch, i, j, k = sch.get_loops(block) + + # inner loops for tensor core computation + i, i_inner = sch.split(i, factors=[None, micro_size_x]) + j, j_inner = sch.split(j, factors=[None, micro_size_y]) + k, k_inner = sch.split(k, factors=[None, micro_size_k]) + + sch.reorder(i, j, k, i_inner, j_inner, k_inner) + + block_inner = block + block_outer = sch.blockize(i_inner) + + i0, i1, i2, i3 = sch.split(i, factors=i_factors) + j0, j1, j2, j3 = sch.split(j, factors=j_factors) + k0, k1 = sch.split(k, k_factors) + k0, kr = sch.split(k0, [None, reduce_k]) + + sch.reorder(i0, j0, i1, j1, i2, j2, kr, i3, j3, k0, k1) + + block_idy = sch.fuse(i0, j0) + block_idx = sch.fuse(i1, j1) + thread_idy = i2 + thread_idz = j2 + + sch.bind(batch, "blockIdx.z") + sch.bind(block_idx, "blockIdx.x") + sch.bind(block_idy, "blockIdx.y") + thread_idz = j2 = thread_idy = sch.fuse(thread_idy, thread_idz) + sch.bind(thread_idy, "threadIdx.y") + sch.bind(kr, "threadIdx.z") + + def smooth_layout_recover(block, scope, l=16, r=16, enable=True): # noqa: E741 + if not enable: + return + sch.transform_layout( + block, + scope, + lambda b, i, j: ( + b, + i // l, + j // r, + i % l, + j % r, + ), + ) + + smooth_layout_recover(block_outer, ("read", 0), *a_lr, enable=intrin_info.inter_transform_a) + smooth_layout_recover( + block_outer, + ("read", 1), + *b_lr, + enable=intrin_info.inter_transform_b, + ) + smooth_layout_recover(block_outer, ("write", 0), enable=True) + + def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False): + block_read = sch.cache_read(block, idx, shared_scope) + sch.compute_at(block_read, k0, preserve_unit_loops=True) + ndim = len(sch.get(block_read).iter_vars) + fused = sch.fuse(*sch.get_loops(block_read)[-ndim:]) + + f_0, f_r, f_1, f_2, f_3, f_4 = sch.split( + fused, factors=[None, reduce_k, num_ty, num_tz, warp_size, vec_len]) + + sch.bind(f_3, "threadIdx.x") + f_1 = f_2 = sch.fuse(f_1, f_2) + sch.bind(f_1, "threadIdx.y") + sch.bind(f_r, "threadIdx.z") + sch.vectorize(f_4) + sch.unroll(f_0) + sch.annotate(f_0, "pragma_unroll_explicit", False) + + # Apply Swizzling + sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle) + # if not, apply padding to alleviate bank conflict + if not (can_swizzle or is_smooth): + pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 + sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset) + return block_read + + a_g2s = fetch_to_shared( + block_outer, + 0, + vec_len=list(config.vectorize.values())[0], + can_swizzle=can_swizzle_a, + is_smooth=intrin_info.smooth_a, + ) + + auto_inline_producers(sch, a_g2s) + + def decode_fetch_to_shared(block, idx): + # step1. create memory hierarchy + # global -> local -> shared + block_shared = sch.cache_read(block, idx, shared_scope) + sch.compute_at(block_shared, k0, preserve_unit_loops=True) + + # TODO(lei): the factor should be analyzed more deeper. 
+ decode_factor = get_coalesced_veclen(sch.get(block_shared)) + _, B_shared_vi, _ = sch.split( + sch.get_loops(block_shared)[-1], factors=[None, 1, decode_factor]) + block_shared_local = sch.cache_read(block_shared, 0, "local") + # global -> dequantzed_local -> shared + # step2. inline to local block, should skip qzeros + is_qzeros = ("with_zeros" in weight_decode_info and weight_decode_info["with_zeros"] and + weight_decode_info["zeros_mode"] == "quantized") + weight_dequantize_block = sch.get_block(weight_decode_info["decode_block"]) + weight_producers = ( + _collect_producers(sch, weight_dequantize_block) if is_qzeros else []) + auto_inline_producers(sch, block_shared_local, weight_producers) + + # get target dequantize buffer's idx + def get_idx(): + # for LUT dequantize, the expr is LUT(w), the idx is 1 + # maybe we can use a more general and structural based way + # to analysis the idx + if weight_decode_info["source_format"]["format"] == "nf": + return 1 + return 0 + + b_idx = get_idx() + # global -> prefetch_local -> dequantzed_local -> shared + block_shared_local_local = sch.cache_read(block_shared_local, b_idx, "local") + # global -> prefetch_shared -> vector load -> dequantzed_local -> shared + block_shared_local_local_shared = sch.cache_read(block_shared_local_local, 0, + shared_scope) + sch.compute_at(block_shared_local, B_shared_vi, preserve_unit_loops=True) + sch.compute_at(block_shared_local_local, B_shared_vi, preserve_unit_loops=True) + + dequantize_block_local = block_shared_local + if is_qzeros: + if ("with_scaling" in weight_decode_info and weight_decode_info["with_scaling"]): + block_local_scales = sch.cache_read(dequantize_block_local, b_idx + 1, "local") + sch.compute_at(block_local_scales, B_shared_vi, preserve_unit_loops=True) + auto_inline_producers(sch, block_local_scales) + + if ("with_zeros" in weight_decode_info and weight_decode_info["with_zeros"]): + block_local_zeros = sch.cache_read(dequantize_block_local, b_idx + 2, "local") + sch.compute_at(block_local_zeros, B_shared_vi, preserve_unit_loops=True) + auto_inline_producers(sch, block_local_zeros) + + for producer in weight_producers: + with suppress(Exception): + auto_inline_producers(sch, producer) + sch.compute_inline(producer) + + # fast type conversion + if ("fast_decoding" in weight_decode_info and weight_decode_info["fast_decoding"]): + source_bit = weight_decode_info["source_format"]["bits"] + out_dtype = weight_decode_info["target_format"] + lop3_intrin_info = get_lop3_intrin_group( + out_dtype=out_dtype, + storage_dtype=weight_decode_info["storage_dtype"], + source_format=weight_decode_info["source_format"]["format"], + source_bit=source_bit, + with_scaling=weight_decode_info["with_scaling"], + with_zeros=weight_decode_info["with_zeros"], + zeros_mode=weight_decode_info["zeros_mode"], + ) + sch.tensorize( + sch.get_loops(dequantize_block_local)[-1], + lop3_intrin_info["compute"], + ) + import_source.append(lop3_intrin_info["c_source"]) + + sch.annotate(block_shared, ann_key="permuted_layout", ann_val=can_swizzle_b) + union_len = (2 + 4) if intrin_info.smooth_b else (2 + 2) + B_shared_fused = sch.fuse(*sch.get_loops(block_shared)[-union_len:-2]) + _, B_shared_rk, B_shared_ty, B_shared_tz, B_shared_tx = sch.split( + B_shared_fused, factors=[None, reduce_k, num_ty, num_tz, warp_size]) + if not (can_swizzle_b or intrin_info.smooth_b): + pad_offset = 8 if intrin_info.in_dtype == "float16" else 16 + sch.storage_align(block_shared, 0, axis=-2, factor=16, offset=pad_offset) + sch.bind(B_shared_tx, 
"threadIdx.x") + B_shared_tz = B_shared_ty = sch.fuse(B_shared_ty, B_shared_tz) + sch.bind(B_shared_ty, "threadIdx.y") + sch.bind(B_shared_rk, "threadIdx.z") + sch.vectorize(sch.get_loops(block_shared)[-1]) + sch.vectorize(sch.get_loops(block_shared_local_local)[-1]) + + sch.compute_at(block_shared_local_local_shared, k0, preserve_unit_loops=True) + ndim = len(sch.get(block_shared_local_local_shared).iter_vars) + _ = sch.fuse(*sch.get_loops(block_shared_local_local_shared)[-ndim:]) + + _bind_thread_based_with_block_reduce_on_config(sch, block_shared_local_local_shared, + num_ty, num_tz, warp_size, reduce_k) + + # cache small tensors, e.g. LUT + if b_idx: + block_shared_lut = sch.cache_read(dequantize_block_local, 0, shared_scope) + sch.reverse_compute_at(block_shared_lut, j2) + _, B_shared_tx = sch.split( + sch.get_loops(block_shared_lut)[-1], factors=[None, warp_size]) + sch.bind(B_shared_tx, "threadIdx.x") + return block_shared_local + + _ = decode_fetch_to_shared(block_outer, 1) + + # create read cache to load matrix from shared memory to wmma fragments + A_mat = sch.cache_read(block_outer, 0, "warp") + B_mat = sch.cache_read(block_outer, 1, "warp") + sch.compute_at(A_mat, k1, preserve_unit_loops=True) + sch.compute_at(B_mat, k1, preserve_unit_loops=True) + + # create write cache to store matrix from wmma fragments to shared memory and global memory + if cache_write_required: + accumulator_shared_to_global = sch.cache_write(block_outer, 0, shared_scope) + + store = sch.cache_write(block_outer, 0, "warp") + sch.reverse_compute_at(store, j2) + + # split the store loop to match hardware intrinsic pattern + i, j = sch.get_loops(store)[-2:] + i0, i1 = sch.split(i, factors=[None, micro_size_x]) + j0, j1 = sch.split(j, factors=[None, micro_size_y]) + sch.reorder(i0, j0, i1, j1) + + if cache_write_required: + auto_inline_consumer_chain(sch, accumulator_shared_to_global) + sch.reverse_compute_at( + accumulator_shared_to_global, + sch.get_loops(store)[-5], + preserve_unit_loops=True, + ) + vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global)) + fused = sch.fuse(*sch.get_loops(accumulator_shared_to_global)[-5:]) + f0, f1, f2 = sch.split(fused, factors=[None, warp_size, vec_len]) + sch.bind(f1, "threadIdx.x") + sch.vectorize(f2) + sch.unroll(f0) + sch.annotate(f0, "pragma_unroll_explicit", False) + else: + auto_inline_consumer_chain(sch, store) + + block_init_c = sch.decompose_reduction(block_outer, k0) + block_init_c_inner = sch.get_child_blocks(block_init_c)[0] + + # Tensorization by hardware intrinsics + + index_map_a, index_map_b, index_map_c = intrin_group["index_map"] + + sch.transform_layout( + A_mat, + ("write", 0), + get_index_map(index_map_a, *a_lr, intrin_info.inter_transform_a), + ) + sch.transform_layout( + B_mat, + ("write", 0), + get_index_map(index_map_b, *b_lr, intrin_info.inter_transform_b), + ) + sch.transform_layout( + store, + ("read", 0), + get_index_map(index_map_c, is_5d=True), + ) + + i, j = sch.get_loops(A_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, a_lr[0]]) + j0, j1 = sch.split(j, factors=[None, a_lr[1]]) + sch.reorder(i0, j0, i1, j1) + ba = sch.blockize(i1) + sch.annotate(ba, ann_key="permuted_layout", ann_val=can_swizzle_a) + sch.tensorize(ba, intrin_group["load_a"]) + + i, j = sch.get_loops(B_mat)[-2:] + i0, i1 = sch.split(i, factors=[None, b_lr[0]]) + j0, j1 = sch.split(j, factors=[None, b_lr[1]]) + sch.reorder(i0, j0, i1, j1) + bb = sch.blockize(i1) + sch.annotate(bb, ann_key="permuted_layout", ann_val=can_swizzle_b) + sch.tensorize(bb, 
intrin_group["load_b"]) + + def tensorize_init_store_compute(): + sch.tensorize(sch.get_loops(block_init_c_inner)[-2], intrin_group["init"]) + sch.tensorize(sch.get_loops(store)[-2], intrin_group["store"]) + sch.tensorize(sch.get_loops(block_inner)[-3], intrin_group["compute"]) + + tensorize_init_store_compute() + + if stage > 1: + sch.annotate( + k0, + ann_key="software_pipeline_stage", + ann_val=[0, 0, stage - 1, stage - 1], + ) + sch.annotate(k0, ann_key="software_pipeline_order", ann_val=[0, 1, 2, 3]) + if use_async: + sch.annotate(k0, "software_pipeline_async_stages", [0]) + # plan rasteration + if not isinstance(config.rasterization_plan, NoRasterization): + device_func, invoke_func = config.rasterization_plan.get_code() + import_source.append(device_func) + sch.annotate( + sch.get_loops(block_init_c)[-2], + ann_key="inject_customized_code_prepend", + ann_val=invoke_func, + ) + # plan import source + if len(import_source) > 0: + sch.annotate( + j2, + ann_key="pragma_import_c", + ann_val=("\n").join(import_source), + ) + return sch + def apply_config( # pylint: disable=too-many-locals,missing-docstring self, func: tir.PrimFunc, - config, + config: Hint, ) -> Optional[tir.Schedule]: def check_sm_version(arch: str) -> int: diff --git a/python/bitblas/quantization/quantization.py b/python/bitblas/quantization/quantization.py index b08f3d188..71ef224d7 100644 --- a/python/bitblas/quantization/quantization.py +++ b/python/bitblas/quantization/quantization.py @@ -144,10 +144,12 @@ def _tir_u8_to_f8_e4m3_to_f16_naive(nbit: int, val: tir.PrimExpr, dtype: str): assert dtype == "float16" s_f16 = (val >> tir.const(7, "uint16")) << tir.const(15, "uint16") e4 = val & tir.const(0x40, "uint16") - prefix = tir.Select(e4 == tir.const(0, "uint16"), tir.const(0x2000, "uint16"), tir.const(0x4000, "uint16")) + prefix = tir.Select(e4 == tir.const(0, "uint16"), tir.const(0x2000, "uint16"), + tir.const(0x4000, "uint16")) e_f16 = (((val & tir.const(63, "uint16")) << tir.const(7, "uint16"))) | prefix return tir.reinterpret("float16", s_f16 | e_f16) + def _tir_u8_to_f8_e4m3_to_f16(nbit: int, val: tir.PrimExpr, dtype: str): assert nbit == 8 assert dtype == "float16" @@ -157,6 +159,7 @@ def _tir_u8_to_f8_e4m3_to_f16(nbit: int, val: tir.PrimExpr, dtype: str): e_f16 = e_f16 ^ tir.const(0x2000, "uint16") return tir.reinterpret("float16", s_f16 | e_f16) + def _tir_u8_to_f8_e5m2_to_f16(nbit: int, val: tir.PrimExpr, dtype: str): assert nbit == 8 assert dtype == "float16" diff --git a/python/bitblas/wrapper/general.py b/python/bitblas/wrapper/general.py index 6c34f6e4e..58aa8d226 100644 --- a/python/bitblas/wrapper/general.py +++ b/python/bitblas/wrapper/general.py @@ -291,8 +291,8 @@ def legalize_c(p): call_str = "if ({} == 0) return; \n\t\t".format(list(dynamic_symbolic_set)[0]) else: call_str = "" - call_str += "{}<<<{}, {}, {}, stream>>>({});".format(function_name, grid_str, block_str, smem_str, - call_args) + call_str += "{}<<<{}, {}, {}, stream>>>({});".format(function_name, grid_str, block_str, + smem_str, call_args) # Create the host function wrapper for the CUDA kernel host_func = """ extern "C" void call({}) {{ @@ -410,9 +410,7 @@ def legalize_c(p): (symbolic,) = list(dynamic_symbolic_set) range_str = opt_shapes[symbolic] if last_range == 0: - call_str = "if ({} == 0) return; \n".format( - symbolic, - ) + call_str = "if ({} == 0) return; \n".format(symbolic,) call_str += "if ({} <= {}) {{\n\t\t\t {}<<<{}, {}, {}, stream>>>({}); \n\t\t}}\n".format( symbolic, range_str, diff --git 
a/testing/python/operators/test_general_matmul_splitk_ops.py b/testing/python/operators/test_general_matmul_splitk_ops.py index 5d21116f1..12fbbcabe 100644 --- a/testing/python/operators/test_general_matmul_splitk_ops.py +++ b/testing/python/operators/test_general_matmul_splitk_ops.py @@ -41,6 +41,7 @@ def test_matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtyp matmul = MatmulWithSplitK(config=matmul_config, enable_tuning=False) assert get_codegen_result(matmul) + @pytest.mark.parametrize( "SPlitK,M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode", [ @@ -83,6 +84,7 @@ def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accu output_torch = torch.matmul(inputs[0], inputs[1].t() if layout == "nt" else inputs[1]) torch.testing.assert_close(output_bitblas, output_torch, rtol=1e-2, atol=1e-1) + @pytest.mark.parametrize( "SPlitK,M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode", [ @@ -93,8 +95,8 @@ def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accu ], ) def test_matmul_torch_forward_fp8e4m3(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, - layout, with_bias, group_size, with_scaling, with_zeros, - zeros_mode): + layout, with_bias, group_size, with_scaling, with_zeros, + zeros_mode): import torch torch.random.manual_seed(0) matmul_config = MatmulConfigWithSplitK( @@ -119,6 +121,7 @@ def test_matmul_torch_forward_fp8e4m3(SplitK, M, N, K, A_dtype, W_dtype, accum_d input_shape = (M, K) weight_shape = (N, K) if layout == "nt" else (K, N) + def map_torch_type(intype): typemap = { @@ -148,6 +151,10 @@ def map_torch_type(intype): print("torch_ref_out", ref_out) print("bitblas_out", bitblas_out) + matmul.forward(torch_a, torch_b, output=bitblas_out) + print("torch_ref_out", ref_out) + print("bitblas_out", bitblas_out) + torch.testing.assert_close(bitblas_out, ref_out, rtol=1e0, atol=1e-1)
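
Usage sketch (illustrative, not part of the patch): the headline user-facing change here is the split-K operator pair MatmulConfigWithSplitK / MatmulWithSplitK. The snippet below mirrors the new test in testing/python/operators/test_general_matmul_splitk_ops.py; the import path and the `k_split` keyword are assumptions (neither is visible in the hunks above) and should be checked against the actual API, everything else is taken from the test and the QuickStart changes.

    import torch
    import bitblas
    # Import path assumed; the classes are only referenced by name in the new test file.
    from bitblas import MatmulConfigWithSplitK, MatmulWithSplitK

    bitblas.set_log_level("Debug")  # renamed from set_debug_level in this patch

    matmul_config = MatmulConfigWithSplitK(
        k_split=4,            # assumed name of the split-K knob (parametrized as SPlitK in the test)
        M=1,                  # M dimension
        N=1024,               # N dimension
        K=1024,               # K dimension
        A_dtype="float16",
        W_dtype="float16",
        accum_dtype="float16",
        out_dtype="float16",
        layout="nt",
        with_bias=False,
    )
    matmul = MatmulWithSplitK(config=matmul_config, enable_tuning=False)

    # Requires a CUDA device.
    a = torch.rand(1, 1024).cuda().half()
    b = torch.rand(1024, 1024).cuda().half()       # "nt" layout: weight is (N, K)
    out = torch.empty(1, 1024).cuda().half()
    matmul.forward(a, b, output=out)               # same call pattern the new test exercises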
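
Back-of-the-envelope sketch (illustrative numbers, not from the patch) of how the new block_reduction_depth hint reshapes the K tiling in apply_block_reduction_with_config, why the tensorcore policy clamps rstep to 64, and why the swizzle check multiplies chunk by reduce_k:

    micro_size_k = 16           # tensor-core K extent for fp16 MMA
    dtype_bits = 16             # float16
    rstep = [32]                # reduce-axis tile picked by the roller (clamped to <= 64
                                # by MAX_REDUCE_K when block reduction is enabled)
    block_reduction_depth = 2   # tagged in matmul_analysis.py for dequantize funcs with M <= 128

    reduce_k = block_reduction_depth
    chunk = rstep[0] // reduce_k                          # 16 K elements per threadIdx.z slice
    k_inner_extent = (reduce_k * chunk) // micro_size_k   # 2 K tiles walked per k0 iteration
    swizzle_ok = (chunk * reduce_k) * dtype_bits == 512   # True: the shared-memory row is 512 bits,
                                                          # so the permuted layout can be used
    print(chunk, k_inner_extent, swizzle_ok)              # -> 16 2 True
    # k0 is additionally split by reduce_k and that loop (kr) is bound to threadIdx.z,
    # so two thread groups cooperate on one output tile, each covering half of the K tiles.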
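
The reformatted _tir_u8_to_f8_e4m3_to_f16_naive helper is easier to sanity-check outside TIR; below is a plain-numpy transcription of the same bit manipulation (a verification sketch only, it ignores subnormals and NaN just like the naive TIR version):

    import numpy as np

    def u8_e4m3_to_f16(val: int) -> np.float16:
        """Decode one e4m3 byte to float16 with the bit trick used in the patch."""
        v = np.uint16(val)
        s_f16 = np.uint16((v >> 7) << 15)          # sign bit -> fp16 bit 15
        e4 = v & np.uint16(0x40)                   # MSB of the 4-bit e4m3 exponent
        # Rebias: e4m3 bias 7 -> fp16 bias 15, i.e. add 8 to the exponent. The two top
        # fp16 exponent bits become 01 when the e4m3 exponent MSB is 0 and 10 when it is 1.
        prefix = np.uint16(0x2000) if e4 == 0 else np.uint16(0x4000)
        e_f16 = np.uint16((v & np.uint16(63)) << 7) | prefix
        return (s_f16 | e_f16).view(np.float16)

    print(u8_e4m3_to_f16(0x40))   # exponent 8, mantissa 0     -> 2.0
    print(u8_e4m3_to_f16(0x3E))   # exponent 7, mantissa 0b110 -> 1.75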
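
A small quality-of-life change buried in the diff: CUDA in base/roller/arch/cuda.py now also accepts a target string and wraps it in tvm.target.Target itself. A minimal sketch, assuming the module path follows the file path in the diff and that a CUDA device is visible; the target tag is only an example:

    from bitblas.base.roller.arch.cuda import CUDA

    # Previously the caller had to construct a tvm.target.Target;
    # after this patch a plain target string/tag is accepted as well.
    arch = CUDA("nvidia/nvidia-a100")
    print(arch.sm_version)   # e.g. 80 for an A100 (parsed from the target's sm_xx arch string)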