From 3e906da6e91651c03ec1653636b74da59ba94adc Mon Sep 17 00:00:00 2001
From: Lingxiao Ma
Date: Thu, 6 Jun 2024 06:31:02 +0000
Subject: [PATCH 1/4] update README

---
 README.md | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7a979c8e..0a5af379 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # BitBLAS
 
 BitBLAS is a library to support mixed-precision BLAS operations on GPUs, for example, the $W_{wdtype}A_{adtype}$ mixed-precision matrix multiplication where $C_{cdtype}[M, N] = A_{adtype}[M, K] \times W_{wdtype}[N, K]$.
-BitBLAS aims to support efficient mixed-precision DNN model deployment, especially the $W_{wdtype}A_{adtype}$ quantization in large language models (LLMs), for example, the $W_{UINT4}A_{FP16}$ in [GPTQ](https://arxiv.org/abs/2210.17323), the $W_{INT2}A_{FP16}$ in [BitDistiller](https://arxiv.org/abs/2402.10631), the $W_{INT2}A_{INT8}$ in [BitNet-b1.58](https://arxiv.org/abs/2402.17764). BitBLAS is based on techniques from our accepted submission at OSDI'24.
+BitBLAS aims to support efficient mixed-precision DNN model deployment, especially the $W_{wdtype}A_{adtype}$ quantization in large language models (LLMs), for example, the $W_{UINT4}A_{FP16}$ in [GPTQ](https://arxiv.org/abs/2210.17323), the $W_{INT2}A_{FP16}$ in [BitDistiller](https://arxiv.org/abs/2402.10631), the $W_{INT2}A_{INT8}$ in [BitNet-b1.58](https://arxiv.org/abs/2402.17764). BitBLAS is based on techniques from our accepted submission "Ladder: Enabling Efficient Low-Precision Deep Learning Computing through Hardware-aware Tensor Transformation" at OSDI'24.
 
 Some of the key features of BitBLAS include:
 
@@ -88,6 +88,20 @@ We are continuously expanding the support matrix. If you have any specific requi
 
 - [Customization](https://github.com/microsoft/BitBLAS/blob/main/docs/ExtendOperatorsWithDSL.md): BitBLAS supports implementing customized mixed-precision DNN operations rather than matrix multiplication with the flexible DSL (TIR Script).
 
+## Reference
+
+Please cite BitBLAS/Ladder in your publications if it helps your research:
+```tex
+@inproceedings {ladder-osdi24,
+author = {Lei Wang and Lingxiao Ma and Shijie Cao and Quanlu Zhang and Jilong Xue and Yining Shi and Ningxin Zheng and Ziming Miao and Fan Yang and Ting Cao and Yuqing Yang and Mao Yang},
+title = {Ladder: Enabling Efficient Low-Precision Deep Learning Computing through Hardware-aware Tensor Transformation},
+booktitle = {18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)},
+year = {2024},
+url = {https://www.usenix.org/conference/osdi24/presentation/wang-lei},
+}
+```
+
+
 ## Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
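The ```bitblas.Matmul``` operator that the README changes above describe computes $C_{cdtype}[M, N] = A_{adtype}[M, K] \times W_{wdtype}[N, K]$. A minimal $W_{INT4}A_{FP16}$ usage sketch follows; it is modeled on the operator interface the README documents, and configuration field names or defaults may differ across BitBLAS versions, so treat it as illustrative rather than canonical.

```python
import bitblas
import torch

# W_{INT4}A_{FP16} GEMV-shaped matmul: C[1, N] = A[1, K] x W[N, K].
# Field names follow the documented MatmulConfig interface; defaults
# may differ between releases.
matmul_config = bitblas.MatmulConfig(
    M=1,
    N=1024,
    K=1024,
    A_dtype="float16",    # activation dtype
    W_dtype="int4",       # weight (storage) dtype
    accum_dtype="float16",
    out_dtype="float16",
    layout="nt",          # A row-major, W stored as [N, K]
)
matmul = bitblas.Matmul(config=matmul_config)

A = torch.rand((1, 1024), dtype=torch.float16).cuda()
W = torch.randint(-7, 8, (1024, 1024), dtype=torch.int8).cuda()

# Pack the int8-held weights into the int4 storage layout, then run.
W_int4 = matmul.transform_weight(W)
C = matmul(A, W_int4)
```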
From da9695a116fedd0ad9a1921b22ace39c81c7cf03 Mon Sep 17 00:00:00 2001
From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com>
Date: Thu, 6 Jun 2024 14:32:35 +0800
Subject: [PATCH 2/4] [Dev] Fix GEMV Dynamic Scheduling with Splitk (#52)

* improve e4m3 decoding.

* append fp16xint1

* Update submodule commit reference

* chore: Update shared memory scope for float32 output dtype

* BUGFIX: UINT8/INT8 Decoding

* feat: Add rasterization options for roller module

* Refactor tensorcore_legalization method to optimize tensor core usage

* feat: Add function to collect variables from expression, improve for splitk

* chore: Update typing import in __init__.py

* chore: Refactor CPU execution of operators

* Refactor matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* chore: Update version to 0.0.1.dev8

* chore: Enable debug output in bitblas.set_debug_level()

* Refactor Linear module matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* Refactor CUDA kernel launch string for dynamic symbolic set

* Bump version to v0.0.1.dev9

* Refactor CUDA kernel launch string for dynamic symbolic set

* Bump version to v0.0.1.dev10

---------

Co-authored-by: LeiWang199
---
 VERSION                               |  2 +-
 python/bitblas/__init__.py            |  2 +-
 python/bitblas/gpu/gemv.py            |  3 ++-
 python/bitblas/gpu/gemv_dequantize.py | 13 +++++++++++++
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/VERSION b/VERSION
index c27a65ce..2e60e791 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.0.1.dev9
\ No newline at end of file
+0.0.1.dev10
\ No newline at end of file
diff --git a/python/bitblas/__init__.py b/python/bitblas/__init__.py
index df2c6319..8100da00 100644
--- a/python/bitblas/__init__.py
+++ b/python/bitblas/__init__.py
@@ -81,4 +81,4 @@ def _init_logger():
 
 _init_logger()
 
-__version__ = "0.0.1.dev9"
+__version__ = "0.0.1.dev10"
diff --git a/python/bitblas/gpu/gemv.py b/python/bitblas/gpu/gemv.py
index 7a2880ed..7b08179d 100644
--- a/python/bitblas/gpu/gemv.py
+++ b/python/bitblas/gpu/gemv.py
@@ -775,7 +775,8 @@ def apply_config(  # pylint: disable=too-many-locals,missing-docstring
             return None
 
         block_info = block_infos[0]
-        if len(block_info.iters) not in [2, 3]:
+        if len(block_info.iters) not in [2, 3, 4]:
+            # either [SK, B, S, R] = [SK, B, S, R] * [SK, B, R]
             # either [B, S, R] = [B, S, R] * [B, R]
             # or [S, R] = [S, R] * [R]
             return None
diff --git a/python/bitblas/gpu/gemv_dequantize.py b/python/bitblas/gpu/gemv_dequantize.py
index 47e4bf42..5a6405f5 100644
--- a/python/bitblas/gpu/gemv_dequantize.py
+++ b/python/bitblas/gpu/gemv_dequantize.py
@@ -110,6 +110,11 @@ def get_vectorize_factor(target_format):
         if len(sch.get_loops(block_b)) == 3:
             i = sch.get_loops(block_b)[0]
             sch.bind(i, "blockIdx.z")
+        elif len(sch.get_loops(block_b)) == 4:
+            # splitk case
+            sk, i = sch.get_loops(block_b)[:2]
+            sch.bind(sk, "blockIdx.y")
+            sch.bind(i, "blockIdx.z")
 
         # get target dequantize buffer's idx
         def get_idx(weight_decode_info: Dict):
@@ -274,6 +279,14 @@ def get_vectorize_factor(target_format):
         if len(sch.get_loops(block_b)) == 3:
             i = sch.get_loops(block_b)[0]
             sch.bind(i, "blockIdx.z")
+        elif len(sch.get_loops(block_b)) == 4:
+            # splitk case
+            sk, i = sch.get_loops(block_b)[:2]
+            sch.bind(sk, "blockIdx.y")
+            sch.bind(i, "blockIdx.z")
+            assert len(config.thread) == 2, "SplitK only supports 2D thread config"
+            num_warps = int(num_warps // config.thread[0])
+
 
         # get target dequantize buffer's idx
         def get_idx(weight_decode_info: Dict):
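The scheduling patch above extends the GEMV rule to a four-level loop nest `[SK, B, S, R]`, where the new `SK` loop is split-K: the reduction axis is partitioned across thread blocks (bound to `blockIdx.y`), each block computes a partial dot product, and the partials are reduced afterwards. The NumPy sketch below illustrates the computation being scheduled; `gemv_split_k` and `split_k` are illustrative names, not BitBLAS API.

```python
import numpy as np

def gemv_split_k(A, W, split_k=4):
    """Illustrative split-K GEMV: C[S] = sum_R W[S, R] * A[R].

    Each of the `split_k` slices plays the role of one blockIdx.y in the
    schedule above: it reduces only K // split_k elements of the R axis,
    and the per-block partial results are summed at the end.
    """
    K = A.shape[0]
    assert K % split_k == 0, "illustration assumes K divisible by split_k"
    chunk = K // split_k
    # One partial product per split-K block (the SK loop).
    partials = [
        W[:, i * chunk:(i + 1) * chunk] @ A[i * chunk:(i + 1) * chunk]
        for i in range(split_k)
    ]
    # Final cross-block reduction over the SK axis.
    return np.sum(partials, axis=0)

A = np.random.rand(128).astype(np.float32)      # activation, length K
W = np.random.rand(64, 128).astype(np.float32)  # weight, [S, R] = [N, K]
np.testing.assert_allclose(gemv_split_k(A, W), W @ A, rtol=1e-4)
```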
From 1057b07a3a8cbf46acf8083a81a29adcea361b0e Mon Sep 17 00:00:00 2001
From: Lingxiao Ma
Date: Thu, 6 Jun 2024 06:43:22 +0000
Subject: [PATCH 3/4] update README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 0a5af379..f112989c 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,8 @@ We are continuously expanding the support matrix. If you have any specific requi
 - ```bitblas.Matmul``` implements the $W_{wdtype}A_{adtype}$ mixed-precision matrix multiplication of $C_{cdtype}[M, N] = A_{adtype}[M, K] \times W_{wdtype}[N, K]$.
 - ```bitblas.Linear``` is a PyTorch ```nn.Linear```-like module to support a Linear of mixed-precision.
 
+- [Python API](https://github.com/microsoft/BitBLAS/blob/main/docs/PythonAPI.md): The Python API documentation of BitBLAS.
+
 - [Integration](https://github.com/microsoft/BitBLAS/tree/main/integration): Explore how BitBLAS seamlessly integrates with LLM deployment frameworks through our examples. Discover the ease of integrating BitBLAS with PyTorch, AutoGPTQ, and vLLM in the 3rd-party integration examples.
 
 - [Customization](https://github.com/microsoft/BitBLAS/blob/main/docs/ExtendOperatorsWithDSL.md): BitBLAS supports implementing customized mixed-precision DNN operations rather than matrix multiplication with the flexible DSL (TIR Script).

From 857732b896808584401f30786073cb39647a2ac6 Mon Sep 17 00:00:00 2001
From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com>
Date: Thu, 6 Jun 2024 21:27:27 +0800
Subject: [PATCH 4/4] [BugFix] Fix a bug in Static shape build (#53)

* improve e4m3 decoding.

* append fp16xint1

* Update submodule commit reference

* chore: Update shared memory scope for float32 output dtype

* BUGFIX: UINT8/INT8 Decoding

* feat: Add rasterization options for roller module

* Refactor tensorcore_legalization method to optimize tensor core usage

* feat: Add function to collect variables from expression, improve for splitk

* chore: Update typing import in __init__.py

* chore: Refactor CPU execution of operators

* Refactor matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* chore: Update version to 0.0.1.dev8

* chore: Enable debug output in bitblas.set_debug_level()

* Refactor Linear module matmul implementation for splitk layout

* Refactor matmul implementation for splitk layout

* Refactor CUDA kernel launch string for dynamic symbolic set

* Bump version to v0.0.1.dev9

* Refactor CUDA kernel launch string for dynamic symbolic set

* Bump version to v0.0.1.dev10

* Refactor CUDA kernel launch string for dynamic symbolic set

---------

Co-authored-by: LeiWang199
---
 python/bitblas/wrapper/general.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/bitblas/wrapper/general.py b/python/bitblas/wrapper/general.py
index f1c8fa8c..6c34f6e4 100644
--- a/python/bitblas/wrapper/general.py
+++ b/python/bitblas/wrapper/general.py
@@ -287,7 +287,10 @@ def legalize_c(p):
         # Determine the shared memory size, defaulting to 0 if not specified
         smem_str = 0 if self.dynamic_smem_buf is None else self.dynamic_smem_buf
         # Format the CUDA kernel launch string
-        call_str = "if ({} == 0) return; \n\t\t".format(list(dynamic_symbolic_set)[0])
+        if len(dynamic_symbolic_set) != 0:
+            call_str = "if ({} == 0) return; \n\t\t".format(list(dynamic_symbolic_set)[0])
+        else:
+            call_str = ""
         call_str += "{}<<<{}, {}, {}, stream>>>({});".format(function_name, grid_str, block_str,
                                                              smem_str, call_args)
         # Create the host function wrapper for the CUDA kernel
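The final patch makes the zero-size guard in the generated host launcher conditional: dynamic-shape kernels first check whether a runtime dimension is zero and return early, but in static-shape builds `dynamic_symbolic_set` is empty, so the unconditional `list(dynamic_symbolic_set)[0]` would raise an `IndexError`. The standalone sketch below reproduces only that fixed control flow; `build_call_str` and the argument values are hypothetical, not BitBLAS API.

```python
# Standalone sketch of the fixed launch-string generation; only the
# control flow mirrors python/bitblas/wrapper/general.py after this patch.
def build_call_str(dynamic_symbolic_set, function_name, grid_str, block_str,
                   smem_str, call_args):
    # Emit the early-return guard only when dynamic symbolic dimensions
    # exist; for static shapes the set is empty and indexing it would fail.
    if len(dynamic_symbolic_set) != 0:
        call_str = "if ({} == 0) return; \n\t\t".format(
            list(dynamic_symbolic_set)[0])
    else:
        call_str = ""
    call_str += "{}<<<{}, {}, {}, stream>>>({});".format(
        function_name, grid_str, block_str, smem_str, call_args)
    return call_str

# Dynamic shape: the guard precedes the kernel launch.
print(build_call_str({"m"}, "matmul_kernel", "grid", "block", 0, "A, B, C, m"))
# Static shape: no guard, just the launch (the case this patch fixes).
print(build_call_str(set(), "matmul_kernel", "grid", "block", 0, "A, B, C"))
```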