[Dev] Bring Block Reduction into our search space and policy #132

Merged — 151 commits, Aug 5, 2024

Commits (151)
d8884e6
Refactor BatchMatMulEmitter and BatchMatMulSelector for improved read…
LeiWang1999 Jul 5, 2024
fc84173
Refactor import statements for improved readability and maintainability
LeiWang1999 Jul 5, 2024
02f64de
Refactor import statements for improved readability and maintainability
LeiWang1999 Jul 5, 2024
397eee6
disable failure email for ci
LeiWang1999 Jul 5, 2024
20f6ad1
remove email notifications.
LeiWang1999 Jul 6, 2024
b93c394
move relax pass from testing to mlc_llm
LeiWang1999 Jul 6, 2024
ba6a6df
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into main
LeiWang1999 Jul 6, 2024
257693a
Refactor scripts with the check_eual_ref_scripts_with_emitter function
LeiWang1999 Jul 6, 2024
9bb7f49
Lint Fix
LeiWang1999 Jul 6, 2024
39e7614
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into main
LeiWang1999 Jul 6, 2024
93eb5a5
Refactor scripts with the check_eual_ref_scripts_with_emitter function
LeiWang1999 Jul 6, 2024
aa66a90
bug fix in test
LeiWang1999 Jul 6, 2024
ae14a53
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 6, 2024
79b08e4
lint fix.
LeiWang1999 Jul 6, 2024
86fd036
test cuda i4 kernel
LeiWang1999 Jul 7, 2024
6b73a21
Refactor copyright notice in i4matmul.hpp
LeiWang1999 Jul 7, 2024
0ba90c1
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 7, 2024
086d208
Refactor BitBLASLinear test module for improved readability and maint…
LeiWang1999 Jul 7, 2024
47a3abd
refactor test as Python versions below 3.9 cannot handle int32 overflow.
LeiWang1999 Jul 8, 2024
024b247
format lint for test
LeiWang1999 Jul 8, 2024
bfedeaa
Refactor test_int4b_fp16_convert.py for improved readability and main…
LeiWang1999 Jul 8, 2024
e672a23
remove unused design file
LeiWang1999 Jul 8, 2024
21e5430
move tile device from package to base
LeiWang1999 Jul 8, 2024
fd11940
dummy impl for codegen
LeiWang1999 Jul 8, 2024
9ccfa85
Refactor file structure for ladder_permutate module
LeiWang1999 Jul 8, 2024
7c7d73e
Refactor backend class and fix typos in comments
LeiWang1999 Jul 8, 2024
47d5fc5
Deep refactor Lib related code.
LeiWang1999 Jul 8, 2024
53dd0dd
remove ci pull.
LeiWang1999 Jul 10, 2024
d58ac43
LintFix
LeiWang1999 Jul 10, 2024
37cb07c
refactor builder for whl build
LeiWang1999 Jul 10, 2024
f5b9999
Refactor TIRWrapper.wrap() method to include an assertion for the opt…
LeiWang1999 Jul 11, 2024
fb78244
Refactor lib_generator to set library and source paths
LeiWang1999 Jul 11, 2024
706e227
lint fix
LeiWang1999 Jul 11, 2024
63f5515
BitNet vllm integration
LeiWang1999 Jul 16, 2024
de91c0d
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 16, 2024
b9655fd
chore: update codespell to version 2.3.0
LeiWang1999 Jul 16, 2024
fff385f
Lintfix
LeiWang1999 Jul 16, 2024
72a98e7
Bump version to 0.0.1.dev13
LeiWang1999 Jul 18, 2024
5646ab5
lint fix
LeiWang1999 Jul 18, 2024
b965863
disable fast decoding [u]int4xint8 by default.
LeiWang1999 Jul 21, 2024
1198fc7
optimize from dict design in Hint
LeiWang1999 Jul 21, 2024
014213c
Implement SplitK
LeiWang1999 Jul 21, 2024
e0ca752
bitnet benchmark generation.
LeiWang1999 Jul 21, 2024
81b9cf0
Add benchmark script for BitNet integration
LeiWang1999 Jul 21, 2024
02edc0b
AtomicAdd Support
LeiWang1999 Jul 21, 2024
1a70c2d
LintFix
LeiWang1999 Jul 21, 2024
28d851c
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 21, 2024
c447a95
ci fix when 3rdparty tvm is initialized.
LeiWang1999 Jul 21, 2024
79a001b
bug fix for setup
LeiWang1999 Jul 21, 2024
31813b2
fix a bug in block reduce
LeiWang1999 Jul 21, 2024
78b6a3d
typo fix
LeiWang1999 Jul 21, 2024
9c55218
BUG Fix for block reduce.
LeiWang1999 Jul 22, 2024
1aa8868
Lint fix
LeiWang1999 Jul 22, 2024
22f70bf
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 22, 2024
5f082a5
Refactor block reduce schedule template
LeiWang1999 Jul 22, 2024
b4fb31e
transform branch from bitblas to bitblas_tl
LeiWang1999 Jul 22, 2024
35eaa00
Fix subproject commit reference in 3rdparty/tvm
LeiWang1999 Jul 22, 2024
254dd74
chore: update submodule branch from bitblas to bitblas_tl
LeiWang1999 Jul 22, 2024
31a44aa
force update config.cmake
LeiWang1999 Jul 22, 2024
427800e
Bug fix
LeiWang1999 Jul 22, 2024
96db111
Fix subproject commit reference in 3rdparty/cutlass
LeiWang1999 Jul 22, 2024
38b251a
chore: Add submodule for cutlass library
LeiWang1999 Jul 22, 2024
87d1c5a
update tl cutlass path
LeiWang1999 Jul 22, 2024
6200b1e
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 22, 2024
0ffe0b5
Refactor BitBLASLinear test module for improved readability and maint…
LeiWang1999 Jul 22, 2024
8e08e77
format fix
LeiWang1999 Jul 22, 2024
df05a64
Copy CUTLASS to the package directory
LeiWang1999 Jul 22, 2024
4f529c5
Refactor setup.py to include additional TVM header files
LeiWang1999 Jul 22, 2024
d02bbc7
lint fix
LeiWang1999 Jul 23, 2024
cffe3fd
bug fix
LeiWang1999 Jul 23, 2024
a8bed74
Refactor BitBLASLinear test module for improved readability and maint…
LeiWang1999 Jul 23, 2024
d4eb5fd
Implement Matmul Benchmark Design
LeiWang1999 Jul 23, 2024
4c6c2c1
chore: Update BitBLAS Matmul benchmark script
LeiWang1999 Jul 23, 2024
0acaca1
lint fix
LeiWang1999 Jul 23, 2024
54d2227
Refactor BitBLASMatmulOpsBenchmark for improved readability and maint…
LeiWang1999 Jul 23, 2024
c2edefb
Refactor BitBLASMatmulOpsBenchmark to disable tuning during benchmark…
LeiWang1999 Jul 23, 2024
e0bc723
lint fix
LeiWang1999 Jul 23, 2024
a4e68d1
Benchmark bot test
LeiWang1999 Jul 23, 2024
df7e9aa
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 23, 2024
1c03365
Refactor BitBLASMatmulOpsBenchmark to disable tuning during benchmark…
LeiWang1999 Jul 23, 2024
4f319fc
Refactor BitBLASMatmulOpsBenchmark to disable tuning during benchmark…
LeiWang1999 Jul 23, 2024
a8833d4
Refactor BitBLASMatmulOpsBenchmark to disable tuning during benchmark…
LeiWang1999 Jul 23, 2024
803f6c6
Refactor BitBLASMatmulOpsBenchmark to disable tuning during benchmark…
LeiWang1999 Jul 23, 2024
df4572b
Refactor BitBLASMatmulOpsBenchmark to disable tuning during benchmark…
LeiWang1999 Jul 23, 2024
45ded45
int8 test case
LeiWang1999 Jul 23, 2024
4229676
Refactor compare_benchmark.py to handle missing benchmark results gra…
LeiWang1999 Jul 23, 2024
b883290
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 23, 2024
476ffee
ci fix
LeiWang1999 Jul 23, 2024
9bd34ff
disable ci for test benchmark
LeiWang1999 Jul 23, 2024
e86f4b2
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 23, 2024
75f3dd9
Refactor BitBLASMatmulOpsBenchmark to disable tuning during benchmark…
LeiWang1999 Jul 23, 2024
79e04aa
remove cli installation
LeiWang1999 Jul 23, 2024
cdd3345
chore: Create virtual environment and install dependencies for benchmark
LeiWang1999 Jul 23, 2024
f099938
Merge branch 'main' into dev
LeiWang1999 Jul 23, 2024
f211ad4
chore: Update benchmark workflow to include comparison step
LeiWang1999 Jul 23, 2024
ddde02a
Lint fix
LeiWang1999 Jul 24, 2024
8045ce9
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 24, 2024
21aee89
Merge branch 'dev' of https://github.com/LeiWang1999/MSBitBLAS into dev
LeiWang1999 Jul 24, 2024
ef1b158
update tvm commit
LeiWang1999 Jul 25, 2024
a8d8841
Improve lower warp memory pass
LeiWang1999 Jul 30, 2024
686b929
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 30, 2024
7736c38
Bug fix
LeiWang1999 Jul 30, 2024
199affc
Enhance to support warp schedule.
LeiWang1999 Jul 31, 2024
9d0c25d
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 31, 2024
7c1f52e
Enhance LOP3 Instructions
LeiWang1999 Jul 31, 2024
d1b2bc7
Enhance LOP3 Instructions
LeiWang1999 Jul 31, 2024
2aac6d0
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Jul 31, 2024
802abde
add test for stage3 propagate
LeiWang1999 Jul 31, 2024
d339037
implement propagate func
LeiWang1999 Jul 31, 2024
0f6a033
Stage3 Ladder Permutate integration
LeiWang1999 Jul 31, 2024
00ec916
get_ladder_stage3_propagate
LeiWang1999 Jul 31, 2024
5316577
comment out benchmark scripts as the setting is too big
LeiWang1999 Jul 31, 2024
dd070f9
ci fix for benchmark
LeiWang1999 Jul 31, 2024
6fcc368
lint fix
LeiWang1999 Jul 31, 2024
705580b
chore: Update benchmark workflow to trigger on pull request comments
LeiWang1999 Jul 31, 2024
c5ba940
Add LDMatrix Transform 3
LeiWang1999 Aug 1, 2024
1566990
Support GPTQ Test
LeiWang1999 Aug 1, 2024
c6c70ef
Fuse BlockReduce Schedule
LeiWang1999 Aug 1, 2024
36128f3
Support mma propagate 3
LeiWang1999 Aug 1, 2024
23ff5f4
Support MMA Propagate Stage 3
LeiWang1999 Aug 1, 2024
de3bf08
Lint Fix
LeiWang1999 Aug 1, 2024
d9830ba
Merge block reduce for dequantize config.
LeiWang1999 Aug 1, 2024
e5a4485
fix codeql
LeiWang1999 Aug 2, 2024
a04282b
chore: Update submodule reference to latest commit
LeiWang1999 Aug 4, 2024
314d3e9
chore: Disable common subexpression elimination in TIR passes
LeiWang1999 Aug 4, 2024
f7d33bb
Lint Fix
LeiWang1999 Aug 4, 2024
db633ed
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Aug 4, 2024
201155a
4bit related lop3 updates.
LeiWang1999 Aug 4, 2024
2b73662
lint fix
LeiWang1999 Aug 4, 2024
1a6a0fd
gptq test fix
LeiWang1999 Aug 4, 2024
e84e3ef
Fix for test
LeiWang1999 Aug 4, 2024
f0fbb55
lint fix
LeiWang1999 Aug 4, 2024
bf30688
lint fix
LeiWang1999 Aug 4, 2024
9a360ba
typofix
LeiWang1999 Aug 4, 2024
ee94536
QuantCompress Test
LeiWang1999 Aug 5, 2024
930cd76
chore: Refactor quant_compress_impl.py for readability and maintainab…
LeiWang1999 Aug 5, 2024
8c24776
Enhance docs to update latest works.
LeiWang1999 Aug 5, 2024
c018e3c
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into dev
LeiWang1999 Aug 5, 2024
38f1713
Refactor weight executors in Matmul class for improved readability an…
LeiWang1999 Aug 5, 2024
4a578ce
Refactor weight executors in Matmul class for improved readability an…
LeiWang1999 Aug 5, 2024
4e7126b
Refactor weight executors in Matmul class for improved readability an…
LeiWang1999 Aug 5, 2024
de9fd2e
Merge branch 'main' of https://github.com/Microsoft/BitBLAS into upda…
LeiWang1999 Aug 5, 2024
e405aa2
removed legacy operator
LeiWang1999 Aug 5, 2024
5709db1
Refactor weight executors in Matmul class for improved readability an…
LeiWang1999 Aug 5, 2024
2d90e7b
LintFix
LeiWang1999 Aug 5, 2024
c2d2cfa
Fix GPTQ Repack with the latest weight transform
LeiWang1999 Aug 5, 2024
ed6a0a1
lint fix
LeiWang1999 Aug 5, 2024
d23ab47
bug fix for rescale dequantize
LeiWang1999 Aug 5, 2024
af16059
test fix
LeiWang1999 Aug 5, 2024
ac316fd
typo fix
LeiWang1999 Aug 5, 2024
71c1d6e
lint fix
LeiWang1999 Aug 5, 2024
2 changes: 1 addition & 1 deletion 3rdparty/tvm
25 changes: 24 additions & 1 deletion bitblas/__init__.py
@@ -39,9 +39,10 @@
from .utils import auto_detect_nvidia_target, apply_transform_on_input # noqa: F401
from .ops.general_matmul import MatmulConfig, Matmul # noqa: F401
from .ops.general_matmul_splitk import MatmulConfigWithSplitK, MatmulWithSplitK # noqa: F401
from .ops.matmul_dequantize import MatmulWeightOnlyDequantizeConfig, MatmulWeightOnlyDequantize # noqa: F401
from .module import Linear # noqa: F401

import warnings
import functools
import logging
from tqdm import tqdm

@@ -89,4 +90,26 @@ def _init_logger():

_init_logger()


def deprecated(reason):
"""
This is a decorator which can be used to mark functions as deprecated.
It will result in a warning being emitted when the function is used.
"""

def decorator(func):

@functools.wraps(func)
def new_func(*args, **kwargs):
warnings.warn(
f"Call to deprecated function {func.__name__} ({reason}).",
category=DeprecationWarning,
stacklevel=2)
return func(*args, **kwargs)

return new_func

return decorator


__version__ = "0.0.1.dev13"
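
A minimal usage sketch of the `deprecated` decorator added above. The `legacy_matmul` name is purely illustrative, and the import assumes the decorator stays a module-level name in `bitblas/__init__.py`:

```python
from bitblas import deprecated  # assumption: exported from the package root as defined above


@deprecated("use bitblas.Matmul instead")
def legacy_matmul(*args, **kwargs):
    """Hypothetical placeholder for an entry point slated for removal."""


# Calling it emits (subject to the active warning filters):
#   DeprecationWarning: Call to deprecated function legacy_matmul (use bitblas.Matmul instead).
# stacklevel=2 makes the warning point at this call site rather than at the wrapper.
legacy_matmul()
```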
158 changes: 85 additions & 73 deletions bitblas/base/roller/policy/tensorcore.py
@@ -117,83 +117,92 @@ def _check_small_tile(td: TileDict):
return True
return False

if not _check_small_tile(td):
return None
if _check_small_tile(td):

smem_limit = min(self.arch.max_smem_usage // td.block_per_SM, self.arch.smem_cap)
rstep_map = td.rstep_map.copy()

def _optimize(node, rstep):
all_steps = self.get_node_reduce_step_candidates(node)
# todo(lei): optimize the all_steps enlarge policy to be a multiple of the original all_steps[k]
for k in all_steps:
all_steps[k] = list(filter(lambda x: x % rstep[k] == 0, all_steps[k]))
if any([v == [] for v in all_steps.values()]):
return rstep

def _shared_memory_usage(td: TileDict):
return node.footprint(td.output_tile, new_rstep_map,
td.tensor_strides_map[node])

def _score(rstep_id):
rstep = {
k.var.name: all_steps[k.var.name][rstep_id[k.var.name]] for k in node.raxis
}
score = 0
shape = node.propagate_inputs_on_reduction(td.get_tile(node), rstep=rstep)
input_buffers = node.block_analyzer.get_input_buffers(node.reduction_block)
for i, input_buffer in enumerate(input_buffers):
score += coalesced_factor(shape[i], input_buffer.shape)
return score

def _enlarge(rstep_id):
candidates = []
for ax in rstep_id:
if rstep_id[ax] + 1 == len(all_steps[ax]):
continue
r = rstep_id.copy()
r[ax] += 1
candidates.append((r, _score(r)))
if len(candidates) == 0:
return None
return max(candidates, key=lambda x: x[1])[0]

cur_rstep_id = {
k.var.name: all_steps[k.var.name].index(rstep[k.var.name]) for k in node.raxis
}
new_rstep_map = rstep_map.copy()
while True:
new_rstep_id = _enlarge(cur_rstep_id)
if new_rstep_id is None:
break
new_rstep_map = {
k.var.name: all_steps[k.var.name][new_rstep_id[k.var.name]]
for k in node.raxis
}
old_rstep_map = td.rstep_map
td.rstep_map = new_rstep_map
smem_usage, _ = _shared_memory_usage(td)
td.rstep_map = old_rstep_map
if smem_usage > smem_limit:
break
else:
cur_rstep_id = new_rstep_id
rstep = {
k.var.name: all_steps[k.var.name][cur_rstep_id[k.var.name]] for k in node.raxis
}
return rstep

smem_limit = min(self.arch.max_smem_usage // td.block_per_SM, self.arch.smem_cap)
rstep_map = td.rstep_map.copy()
for node in self.ordered_nodes:
if len(node.raxis) > 0:
rstep = _optimize(node, rstep_map)
rstep_map = rstep

def _optimize(node, rstep):
all_steps = self.get_node_reduce_step_candidates(node)
# todo(lei): optimize the all_steps enlarge policy to be a multiple of the original all_steps[k]
for k in all_steps:
all_steps[k] = list(filter(lambda x: x % rstep[k] == 0, all_steps[k]))
if any([v == [] for v in all_steps.values()]):
return rstep
td.rstep_map = rstep_map
td.smem_cost, td.cached_tensors_map = self._compute_shared_memory_usage(td)

def _shared_memory_usage(td: TileDict):
return node.footprint(td.output_tile, new_rstep_map, td.tensor_strides_map[node])
if self.block_reduction_depth is not None:

def _score(rstep_id):
rstep = {
k.var.name: all_steps[k.var.name][rstep_id[k.var.name]] for k in node.raxis
}
score = 0
shape = node.propagate_inputs_on_reduction(td.get_tile(node), rstep=rstep)
input_buffers = node.block_analyzer.get_input_buffers(node.reduction_block)
for i, input_buffer in enumerate(input_buffers):
score += coalesced_factor(shape[i], input_buffer.shape)
return score

def _enlarge(rstep_id):
candidates = []
for ax in rstep_id:
if rstep_id[ax] + 1 == len(all_steps[ax]):
continue
r = rstep_id.copy()
r[ax] += 1
candidates.append((r, _score(r)))
if len(candidates) == 0:
return None
return max(candidates, key=lambda x: x[1])[0]

cur_rstep_id = {
k.var.name: all_steps[k.var.name].index(rstep[k.var.name]) for k in node.raxis
}
new_rstep_map = rstep_map.copy()
while True:
new_rstep_id = _enlarge(cur_rstep_id)
if new_rstep_id is None:
break
new_rstep_map = {
k.var.name: all_steps[k.var.name][new_rstep_id[k.var.name]] for k in node.raxis
}
old_rstep_map = td.rstep_map
td.rstep_map = new_rstep_map
smem_usage, _ = _shared_memory_usage(td)
td.rstep_map = old_rstep_map
if smem_usage > smem_limit:
break
else:
cur_rstep_id = new_rstep_id
rstep = {
k.var.name: all_steps[k.var.name][cur_rstep_id[k.var.name]] for k in node.raxis
}
return rstep
def _expand_with_tags(rstep):
new_rstep = {k: v * self.block_reduction_depth for k, v in rstep.items()}
return new_rstep

rstep_map = td.rstep_map.copy()
for node in self.ordered_nodes:
if len(node.raxis) > 0:
rstep = _expand_with_tags(rstep_map)
rstep_map = rstep
td.rstep_map = rstep_map

for node in self.ordered_nodes:
if len(node.raxis) > 0:
rstep = _optimize(node, rstep_map)
rstep_map = rstep

# if is_block_reduction:
# # If block reduction, we should constrain the max value is 64
# # Otherwise it will introduce an issue of cuda invalid args.
# MAX_REDUCE_K = 64
# for k in rstep_map:
# rstep_map[k] = min(rstep_map[k], MAX_REDUCE_K)
td.rstep_map = rstep_map
td.smem_cost, td.cached_tensors_map = self._compute_shared_memory_usage(td)
return

def get_node_reduce_step_candidates(self, node):
@@ -318,12 +327,15 @@ def _score(node, thread):  # small is better
# smem capacity
# TODO: This is a dummy mul which avoid reusing some shared memory.
# Should be removed in the future.
if td.smem_cost > (self.arch.smem_cap * 1.3):
if td.smem_cost > (self.arch.smem_cap):
info_message = f"Tile Dict: {td.output_tile} Shared memory exceeds the static capacity," \
" use dynamic shared memory."
logger.info(info_message)
codegen_dict.shared_scope = "shared.dyn"

# Or assume we always use shared memory
# codegen_dict.shared_scope = "shared.dyn"

codegen_dict.complete_config(node)
codegen_dict.vectorize = self._plan_vectorize(self.prim_func_node, td, block_size)
codegen_dict.arch = self.arch
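
The `_expand_with_tags` helper introduced in this hunk simply scales every reduction step by `block_reduction_depth` before the shared-memory-bounded optimization pass runs again. A standalone sketch with made-up values (the real `rstep_map` comes from the roller's `TileDict`):

```python
# Illustrative only: how block reduction widens the per-block reduce step.
block_reduction_depth = 2  # the depth tagged by matmul_analysis for small-M dequantize ops


def _expand_with_tags(rstep):
    # Each reduction axis step is multiplied by the block-reduction depth,
    # so one thread block covers a deeper slice of K per iteration.
    return {k: v * block_reduction_depth for k, v in rstep.items()}


rstep_map = {"k": 16}                 # hypothetical step chosen by the base policy
print(_expand_with_tags(rstep_map))   # {'k': 32}
```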
6 changes: 4 additions & 2 deletions bitblas/gpu/matmul_analysis.py
@@ -622,14 +622,16 @@ def check_last_trait(region: List[Range]):
# Analysis Block Reduction Optimization
# Currently, we only support block reduction depth 2 for small M
# When the func is a dequantize like ops, we should consider the M
require_block_reduce = False
if hasattr(func.attrs, "dequantize_info"):
for arg in func.params:
inp_shape = func.buffer_map[arg].shape
M = inp_shape[0]
if isinstance(M, tir.IntImm) and M <= 128:
tags["block_reduction_depth"] = 2
require_block_reduce = True
break

if require_block_reduce and check_sm_version(target.arch) == 80:
tags["block_reduction_depth"] = 2
return tags

(main_block,) = reduction_blocks
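
The tagging decision above boils down to: request `block_reduction_depth = 2` only for dequantize-like funcs whose M dimension is at most 128, and (new in this hunk) only on sm80 targets. A condensed, hypothetical restatement with illustrative names (the real code inspects `func.attrs`, `func.buffer_map`, and `target.arch`):

```python
# Condensed restatement of the block-reduction tagging rule (illustrative names).
def block_reduction_tags(is_dequantize: bool, m: int, sm_version: int) -> dict:
    tags = {}
    if is_dequantize and m <= 128 and sm_version == 80:
        tags["block_reduction_depth"] = 2
    return tags


assert block_reduction_tags(True, 64, 80) == {"block_reduction_depth": 2}
assert block_reduction_tags(True, 64, 89) == {}    # non-sm80 targets are no longer tagged
assert block_reduction_tags(True, 256, 80) == {}   # large M keeps the default schedule
assert block_reduction_tags(False, 64, 80) == {}   # non-dequantize funcs are untouched
```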
9 changes: 5 additions & 4 deletions bitblas/gpu/matmul_mma_dequantize.py
@@ -2264,10 +2264,11 @@ def get_idx():
lop3_intrin_info["compute"],
)
# Assume the grouped K is the last dim of the scaling
grouped_k = sch.get(bf).reads[1].buffer.shape[-1]
# TODO(lei): This is a hack to get the loop extent
loop_extent = 8 if out_dtype == "float16" else 16
sch.unsafe_inject_call_argument(bf, -2, loop_extent * grouped_k)
if "with_scaling" in weight_decode_info and weight_decode_info["with_scaling"]:
grouped_k = sch.get(bf).reads[1].buffer.shape[-1]
# TODO(lei): This is a hack to get the loop extent
loop_extent = 8 if out_dtype == "float16" else 16
sch.unsafe_inject_call_argument(bf, -2, loop_extent * grouped_k)
import_source.append(lop3_intrin_info["c_source"])

def tensorize_init_store_compute():
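
The change in this hunk guards the grouped-K argument injection behind the scaling flag, since `reads[1]` only refers to a scale buffer when scaling is enabled. A hypothetical illustration of the guard pattern (values made up):

```python
# Illustrative: the loop-extent hack is only meaningful when a scale buffer exists.
weight_decode_info = {"with_scaling": False}  # hypothetical decode config

if weight_decode_info.get("with_scaling", False):
    # would read the scale buffer's last dimension (group count) and patch
    # the LOP3 intrinsic call with loop_extent * grouped_k
    pass
```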
21 changes: 20 additions & 1 deletion bitblas/module/__init__.py
@@ -40,6 +40,24 @@ def unpack_qzeros(qzeros, bits):
return torch.bitwise_and(unpacked_zeros + 1, 2**bits - 1)


def unpack_qweight(qweight, bits):
qweight = qweight.view(torch.int8)
elems_per_int8 = 8 // bits
unpacked_weight = torch.zeros(
(qweight.shape[0], qweight.shape[1] * elems_per_int8),
dtype=torch.int8,
device=qweight.device,
requires_grad=False,
)
for col in range(unpacked_weight.shape[1]):
i = col % elems_per_int8
unpacked_weight[:, col] = (qweight[:, col // elems_per_int8] >> (bits * i))

# Follow the instruction in AutoGPTQ qlinear_cuda_old.py line 303
# NOTE: It appears that casting after the `unpacked_zeros + 1` is important.
return torch.bitwise_and(unpacked_weight, 2**bits - 1)


class Linear(nn.Module):
opt_M = [1, 16, 32, 64, 128, 256, 512]
STORAGE_DTYPE = "int8" # assume int8 storage
@@ -279,8 +297,9 @@ def load_and_transform_weight(
def repack_from_gptq(self, gptq_module):
# qweight in gptq old quant linear stored with (out_features, in_features), should be transposed.
qweight = gptq_module.qweight.T.contiguous().view(self.TORCH_STORAGE_DTYPE)
intweight = unpack_qweight(qweight, self.bits).contiguous()
if self.bitblas_matmul.weight_transform is not None:
qweight = self.bitblas_matmul.weight_transform(qweight.cpu()).cuda()
qweight = self.bitblas_matmul.weight_transform(intweight.cpu()).cuda()
self.qweight = qweight
# scales in gptq old quant linear stored with (in_features // group_size, out_features), should be transposed.
scales = gptq_module.scales.T.contiguous().view(self.torch_dtype)
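
A minimal round-trip check for the 4-bit unpacking path above. The `pack_int4` helper is hypothetical and written only for this sketch, and the import assumes `unpack_qweight` remains importable from `bitblas.module`:

```python
import torch

from bitblas.module import unpack_qweight  # assumption: module-level function as added above


def pack_int4(intweight: torch.Tensor) -> torch.Tensor:
    # Hypothetical inverse of unpack_qweight for bits=4: pack two 4-bit values
    # per int8 column, low nibble first.
    assert intweight.shape[1] % 2 == 0
    low = intweight[:, 0::2].to(torch.int32) & 0xF
    high = (intweight[:, 1::2].to(torch.int32) & 0xF) << 4
    return (low | high).to(torch.uint8).view(torch.int8)


intweight = torch.randint(0, 16, (4, 8), dtype=torch.int8)
qweight = pack_int4(intweight)
assert torch.equal(unpack_qweight(qweight, bits=4), intweight)
```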
3 changes: 1 addition & 2 deletions bitblas/ops/__init__.py
@@ -1,8 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from .operator import Operator, OperatorConfig # noqa: F401
from .matmul import Matmul, MatmulConfig # noqa: F401
from .matmul_dequantize import MatmulWeightOnlyDequantize, MatmulWeightOnlyDequantizeConfig # noqa: F401
from .general_matmul import Matmul, MatmulConfig # noqa: F401
from .ladder_permutate import LadderPermutate, LadderPermutateConfig # noqa: F401
from .lop3_permutate import LOP3Permutate, LOP3PermutateConfig # noqa: F401
from .quant_compress import QuantCompress, QuantCompressConfig # noqa: F401
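
Since the legacy weight-only dequantize operator is dropped from `bitblas.ops`, downstream imports move to the general matmul entry points re-exported above. A short migration sketch (assuming no other import paths change in this PR):

```python
# Before this PR (now removed):
# from bitblas.ops.matmul_dequantize import MatmulWeightOnlyDequantize, MatmulWeightOnlyDequantizeConfig

# After this PR, the general matmul operator is the re-exported entry point:
from bitblas.ops import Matmul, MatmulConfig
```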