[DEV] Remove extra dependencies and refactor the tvm import. (#16)

* Update dependabot.yml * remove dependency.
microsoft · Apr 16, 2024 · 8f2fff4 · 8f2fff4
1 parent 93eef17
commit 8f2fff4
Show file tree

Hide file tree

Showing 16 changed files with 114 additions and 100 deletions.
diff --git a/benchmark/dsl/convolution.py b/benchmark/dsl/convolution.py
@@ -103,7 +103,7 @@ def conv2d_nhwc_hwio(n, f, h, w, c, kh, kw, s, d, p, in_dtype="float16", out_dty
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean
 
-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))
 
     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {

diff --git a/benchmark/dsl/matmul.py b/benchmark/dsl/matmul.py
@@ -17,7 +17,7 @@
 
 # fmt:off
 test_shapes = [
-    # (prim_func, input_args, default_dlight_schedule),
+    # (prim_func, input_args, default_bitblas_schedule),
     (matmul_nt, (1024, 1024, 1024, "float16", "float16"), Matmul),
     (matmul_nt, (16, 8192, 8192, "float16", "float16"), Matmul),
     (matmul_nt, (32, 8192, 8192, "float16", "float16"), Matmul),
@@ -113,15 +113,15 @@
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean
 
-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))
 
     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency * 1e3,
-            "fast_dlight_top20_latency": best.latency * 1e3,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3,
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency * 1e3,
+            "fast_bitblas_top20_latency": best.latency * 1e3,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3,
         }
     }
 
@@ -151,10 +151,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3f} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3f} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_af.py b/benchmark/dsl/matmul_dequantize_af.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
 
@@ -181,15 +181,15 @@
     else:
         t = 1e4 - 1
 
-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))
 
     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }
 
@@ -219,10 +219,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_fp.py b/benchmark/dsl/matmul_dequantize_fp.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
 
@@ -180,15 +180,15 @@
     else:
         t = 1e4 - 1
 
-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))
 
     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }
 
@@ -218,10 +218,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_int1.py b/benchmark/dsl/matmul_dequantize_int1.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,9 +11,10 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
-
+bitblas.set_log_level("DEBUG")
 # append a parser for the benchmark set
 
 parser = argparse.ArgumentParser(description="Benchmark BitBLAS int8xint1 on a specific target.")
@@ -50,7 +50,7 @@
 
 llm_int8xint1 = [
     # square test
-    (matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "uint", False,
+    (matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "int", False,
                               False, group_size, True, False), Matmul),
     # BLOOM-176B
     (matmul_nt_dequantize_b, (1, 43008, 14336, "int8", "int8", "int32", 1, "int8", "uint", False,
@@ -150,6 +150,8 @@
     tune_start = time.time()
     cpresults, best = apply_and_build(func, configs, arch, parallel_build=True)
     fast_tune_time = time.time() - tune_start
+    # print(best.sch.mod)
+    print(best.code)
     print("[BitBLAS] The best latency of top 1 is {:.3f} ms".format(cpresults[0].latency))
     print("[BitBLAS] The best latency of top 20 is {:.3f} ms".format(best.latency))
 
@@ -183,15 +185,15 @@
     else:
         t = 1e4 - 1
 
-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))
 
     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }
 
@@ -221,10 +223,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_int4.py b/benchmark/dsl/matmul_dequantize_int4.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
 
@@ -250,15 +250,15 @@
     else:
         t = 1e4 - 1
 
-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))
 
     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }
 
@@ -288,10 +288,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/weight_propagate.py b/benchmark/dsl/weight_propagate.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -17,6 +16,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_b,
 )
+import tvm
 import time
 import argparse
 
@@ -512,15 +512,15 @@
     else:
         t = 1e4 - 1
 
-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))
 
     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }
 
@@ -550,10 +550,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/python/bitblas/base/roller/bestfit.py b/python/bitblas/base/roller/bestfit.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-"""Benifit For Dlight Schedule"""
+"""Benifit For BitBLAS Schedule"""
 class Block:
     def __init__(self, start, end, is_free):
         self.start = start

diff --git a/python/bitblas/base/roller/policy/default.py b/python/bitblas/base/roller/policy/default.py
@@ -20,7 +20,7 @@
 class DefaultPolicy:
     """
     Default Policy for fastdlight, a heuristic plan that tries to
-    minimize memory traffic and maximize parallelism.for Dlight Schedule.
+    minimize memory traffic and maximize parallelism.for BitBLAS Schedule.
     """
 
     def __init__(self,

diff --git a/python/bitblas/gpu/intrin/lop3.py b/python/bitblas/gpu/intrin/lop3.py
@@ -565,7 +565,9 @@
 decode_i1s_to_i8s = """template <typename T1, typename T2>
 __device__ void decode_i1s_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16)
 {
-    int *i8s = reinterpret_cast<int *>(_i8s);
+    int i8s[4];
+    // vector load
+    *reinterpret_cast<int4 *>(i8s) = *reinterpret_cast<int4 *>(_i8s);
     int16_t i1b_i16 = *reinterpret_cast<int16_t *>(_i1b);
     // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15}
     // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x}
@@ -577,16 +579,17 @@
     static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
     static constexpr uint BOTTOM_MASK = 0x01010101;      // 0x1 -> 0b01 select 0,1
     static constexpr uint I8s_MAGIC_NUM = 0x00000000;
-    static constexpr uint MEDIAN_NUM = 0x00000000;
-    static constexpr uint TRANSFORM_SUBTRACT = 0x01010101;
+    static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1
 
     for (int i = 0; i < N / 4; i++)
     {
         asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
                      : "=r"(i8s[i])
                      : "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
-        i8s[i] = __vsubss4(__vaddss4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT);
+        i8s[i] = __vadd4(i8s[i], i8s[i]);
+        i8s[i] = __vadd4(i8s[i], TRANSFORM_SUBTRACT);
     }
+    *reinterpret_cast<int4 *>(_i8s) = *reinterpret_cast<int4 *>(i8s);
 }
 
 template <typename T1, typename T2>