diff --git a/benchmark/dsl/convolution.py b/benchmark/dsl/convolution.py
index 3d9b5ac8..bf02a41d 100644
--- a/benchmark/dsl/convolution.py
+++ b/benchmark/dsl/convolution.py
@@ -103,7 +103,7 @@ def conv2d_nhwc_hwio(n, f, h, w, c, kh, kw, s, d, p, in_dtype="float16", out_dty
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
diff --git a/benchmark/dsl/matmul.py b/benchmark/dsl/matmul.py
index 85068b1e..85b9374e 100644
--- a/benchmark/dsl/matmul.py
+++ b/benchmark/dsl/matmul.py
@@ -17,7 +17,7 @@

 # fmt:off
 test_shapes = [
-    # (prim_func, input_args, default_dlight_schedule),
+    # (prim_func, input_args, default_bitblas_schedule),
     (matmul_nt, (1024, 1024, 1024, "float16", "float16"), Matmul),
     (matmul_nt, (16, 8192, 8192, "float16", "float16"), Matmul),
     (matmul_nt, (32, 8192, 8192, "float16", "float16"), Matmul),
@@ -113,15 +113,15 @@
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency * 1e3,
-            "fast_dlight_top20_latency": best.latency * 1e3,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3,
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency * 1e3,
+            "fast_bitblas_top20_latency": best.latency * 1e3,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3,
         }
     }

@@ -151,10 +151,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3f} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3f} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_af.py b/benchmark/dsl/matmul_dequantize_af.py
index e370de3c..5bc8362a 100644
--- a/benchmark/dsl/matmul_dequantize_af.py
+++ b/benchmark/dsl/matmul_dequantize_af.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -181,15 +181,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -219,10 +219,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_fp.py b/benchmark/dsl/matmul_dequantize_fp.py
index 66774aaf..102ba978 100644
--- a/benchmark/dsl/matmul_dequantize_fp.py
+++ b/benchmark/dsl/matmul_dequantize_fp.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -180,15 +180,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -218,10 +218,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_int1.py b/benchmark/dsl/matmul_dequantize_int1.py
index 1f8b1775..8874c37f 100644
--- a/benchmark/dsl/matmul_dequantize_int1.py
+++ b/benchmark/dsl/matmul_dequantize_int1.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,9 +11,10 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
-
+bitblas.set_log_level("DEBUG")

 # append a parser for the benchmark set
 parser = argparse.ArgumentParser(description="Benchmark BitBLAS int8xint1 on a specific target.")
@@ -50,7 +50,7 @@ llm_int8xint1 = [
     # square test
-    (matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "uint", False,
+    (matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "int", False,
      False, group_size, True, False), Matmul),
     # BLOOM-176B
     (matmul_nt_dequantize_b, (1, 43008, 14336, "int8", "int8", "int32", 1, "int8", "uint", False,
@@ -150,6 +150,8 @@
     tune_start = time.time()
     cpresults, best = apply_and_build(func, configs, arch, parallel_build=True)
     fast_tune_time = time.time() - tune_start
+    # print(best.sch.mod)
+    print(best.code)
     print("[BitBLAS] The best latency of top 1 is {:.3f} ms".format(cpresults[0].latency))
     print("[BitBLAS] The best latency of top 20 is {:.3f} ms".format(best.latency))

@@ -183,15 +185,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -221,10 +223,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_int4.py b/benchmark/dsl/matmul_dequantize_int4.py
index 7eb6cf8d..f8c57755 100644
--- a/benchmark/dsl/matmul_dequantize_int4.py
+++ b/benchmark/dsl/matmul_dequantize_int4.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -250,15 +250,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -288,10 +288,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/weight_propagate.py b/benchmark/dsl/weight_propagate.py
index e69310af..aab5316c 100644
--- a/benchmark/dsl/weight_propagate.py
+++ b/benchmark/dsl/weight_propagate.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -17,6 +16,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -512,15 +512,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -550,10 +550,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/python/bitblas/base/roller/bestfit.py b/python/bitblas/base/roller/bestfit.py
index b0e541c6..ad8ec20a 100644
--- a/python/bitblas/base/roller/bestfit.py
+++ b/python/bitblas/base/roller/bestfit.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-"""Benifit For Dlight Schedule"""
+"""Benefit For BitBLAS Schedule"""

 class Block:
     def __init__(self, start, end, is_free):
         self.start = start
diff --git a/python/bitblas/base/roller/policy/default.py b/python/bitblas/base/roller/policy/default.py
index 5526e131..81aeba12 100644
--- a/python/bitblas/base/roller/policy/default.py
+++ b/python/bitblas/base/roller/policy/default.py
@@ -20,7 +20,7 @@ class DefaultPolicy:
     """
     Default Policy for fastdlight, a heuristic plan that tries to
-    minimize memory traffic and maximize parallelism.for Dlight Schedule.
+    minimize memory traffic and maximize parallelism for BitBLAS Schedule.
""" def __init__(self, diff --git a/python/bitblas/gpu/intrin/lop3.py b/python/bitblas/gpu/intrin/lop3.py index bad04ceb..43c91a64 100644 --- a/python/bitblas/gpu/intrin/lop3.py +++ b/python/bitblas/gpu/intrin/lop3.py @@ -565,7 +565,9 @@ decode_i1s_to_i8s = """template __device__ void decode_i1s_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) { - int *i8s = reinterpret_cast(_i8s); + int i8s[4]; + // vector load + *reinterpret_cast(i8s) = *reinterpret_cast(_i8s); int16_t i1b_i16 = *reinterpret_cast(_i1b); // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15} // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x} @@ -577,16 +579,17 @@ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1 static constexpr uint I8s_MAGIC_NUM = 0x00000000; - static constexpr uint MEDIAN_NUM = 0x00000000; - static constexpr uint TRANSFORM_SUBTRACT = 0x01010101; + static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1 for (int i = 0; i < N / 4; i++) { asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" : "=r"(i8s[i]) : "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut)); - i8s[i] = __vsubss4(__vaddss4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT); + i8s[i] = __vadd4(i8s[i], i8s[i]); + i8s[i] = __vadd4(i8s[i], TRANSFORM_SUBTRACT); } + *reinterpret_cast(_i8s) = *reinterpret_cast(i8s); } template diff --git a/testing/cpp/lop3_type_conversion/fast_decoding.hpp b/testing/cpp/lop3_type_conversion/fast_decoding.hpp index ac24d40a..184dfa24 100644 --- a/testing/cpp/lop3_type_conversion/fast_decoding.hpp +++ b/testing/cpp/lop3_type_conversion/fast_decoding.hpp @@ -712,7 +712,9 @@ __device__ void decode_i2u_to_i8s(T1 *_i2u, T2 *B_local_decode, const int N = 16 template __device__ void decode_i1b_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) { - int *i8s = reinterpret_cast(_i8s); + int i8s[4]; + // vector load + *reinterpret_cast(i8s) = *reinterpret_cast(_i8s); int16_t i1b_i16 = *reinterpret_cast(_i1b); // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15} // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x} @@ -724,7 +726,7 @@ __device__ void decode_i1b_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1 static constexpr uint I8s_MAGIC_NUM = 0x00000000; - static constexpr uint TRANSFORM_SUBTRACT = 0x01010101; + static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1 for (int i = 0; i < N / 4; i++) { @@ -734,9 +736,15 @@ __device__ void decode_i1b_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) if constexpr (isSigned) { - i8s[i] = __vsubss4(__vaddss4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT); + int _i8s = i8s[i]; + int tmp = __vcmpleu4(_i8s, 0); + _i8s |= tmp; + i8s[i] = _i8s; + // // i8s[i] = __vadd4(__vadd4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT); } } + // vector store + *reinterpret_cast(_i8s) = *reinterpret_cast(i8s); } template diff --git a/testing/python/dsl/test_auto_normalized_tensorcore.py b/testing/python/dsl/test_auto_normalized_tensorcore.py index c7c9a723..eb6e0bae 100644 --- a/testing/python/dsl/test_auto_normalized_tensorcore.py +++ b/testing/python/dsl/test_auto_normalized_tensorcore.py @@ -11,7 +11,7 @@ import time benchmark_sets = [ - # (prim_func, input_args, default_dlight_schedule), + # (prim_func, input_args, 
     (conv2d_nhwc_hwio, (128, 64, 224, 224, 3, 7, 7, 2, 1, 3, "float16", "float16"), Matmul),
     (conv2d_nhwc_ohwi, (128, 64, 56, 56, 64, 3, 3, 1, 1, 1, "float16", "float16"), Matmul),
     (conv2d_nhwc_hwio, (128, 64, 56, 56, 64, 1, 1, 1, 1, 1, "float16", "float16"), Matmul),
@@ -66,15 +66,15 @@
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency * 1e3,
-            "fast_dlight_top20_latency": best.latency * 1e3,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3,
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency * 1e3,
+            "fast_bitblas_top20_latency": best.latency * 1e3,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3,
         }
     }
     benchmark_results.update(profile_config)
@@ -103,10 +103,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3f} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3f} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/testing/python/operators/test_ladder_permutate_ops.py b/testing/python/operators/test_ladder_permutate_ops.py
index e14b5c50..ed8586e0 100644
--- a/testing/python/operators/test_ladder_permutate_ops.py
+++ b/testing/python/operators/test_ladder_permutate_ops.py
@@ -1,9 +1,9 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.ops.ladder_permutate import LadderPermutate, LadderPermutateConfig
+import tvm

 target = tvm.target.Target("llvm")
diff --git a/testing/python/operators/test_lop3_permutate_ops.py b/testing/python/operators/test_lop3_permutate_ops.py
index 31017d91..55dde117 100644
--- a/testing/python/operators/test_lop3_permutate_ops.py
+++ b/testing/python/operators/test_lop3_permutate_ops.py
@@ -1,10 +1,10 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.ops.lop3_permutate import LOP3Permutate, LOP3PermutateConfig
+import tvm

 target = tvm.target.Target("llvm")

 # fmt: off
diff --git a/testing/python/operators/test_matmul_dequantize_ops.py b/testing/python/operators/test_matmul_dequantize_ops.py
index a4a48f26..dddafc98 100644
--- a/testing/python/operators/test_matmul_dequantize_ops.py
+++ b/testing/python/operators/test_matmul_dequantize_ops.py
@@ -1,13 +1,13 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.utils import auto_detect_nvidia_target
 from bitblas.ops.matmul_dequantize import (
     MatmulWeightOnlyDequantize,
     MatmulWeightOnlyDequantizeConfig,
 )
+import tvm
 import logging
 from bitblas import set_log_level
diff --git a/testing/python/operators/test_param_permutate_ops.py b/testing/python/operators/test_param_permutate_ops.py
index 4f8d32c8..9149c15e 100644
--- a/testing/python/operators/test_param_permutate_ops.py
+++ b/testing/python/operators/test_param_permutate_ops.py
@@ -1,10 +1,11 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.ops.param_permutate import ParamPermutate, ParamPermutateConfig
+import tvm
+

 target = tvm.target.Target("llvm")
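
Note on the 1-bit signed decode change above (python/bitblas/gpu/intrin/lop3.py and testing/cpp/lop3_type_conversion/fast_decoding.hpp): after the lop3 step leaves one decoded bit b in {0, 1} in each output byte, both new code paths map it to the signed int8 value 2*b - 1, i.e. {0, 1} -> {-1, +1}. The lop3.py kernel does this with two byte-wise __vadd4 calls against TRANSFORM_SUBTRACT = 0xffffffff; the fast_decoding.hpp reference does it with a __vcmpleu4 compare-and-or. The standalone host-side sketch below is not part of the patch; it emulates the byte-wise CUDA intrinsics in plain C++ (the emulate_* helpers are illustrative stand-ins, not CUDA APIs) and checks that the two formulations agree on that mapping.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Host-side emulation of the per-byte SIMD intrinsics used in the patch.
static uint32_t emulate_vadd4(uint32_t a, uint32_t b) {
    uint32_t r = 0;
    for (int i = 0; i < 4; ++i) {
        uint32_t x = (a >> (8 * i)) & 0xff, y = (b >> (8 * i)) & 0xff;
        r |= ((x + y) & 0xff) << (8 * i);  // per-byte add with wrap-around
    }
    return r;
}

static uint32_t emulate_vcmpleu4(uint32_t a, uint32_t b) {
    uint32_t r = 0;
    for (int i = 0; i < 4; ++i) {
        uint32_t x = (a >> (8 * i)) & 0xff, y = (b >> (8 * i)) & 0xff;
        if (x <= y) r |= 0xffu << (8 * i);  // 0xFF wherever byte(a) <= byte(b)
    }
    return r;
}

int main() {
    // Each byte of 'word' holds one decoded bit (0 or 1), as produced by the lop3 step.
    for (uint32_t bits = 0; bits < 16; ++bits) {
        uint32_t word = 0;
        for (int i = 0; i < 4; ++i)
            word |= ((bits >> i) & 1u) << (8 * i);

        // Path used in lop3.py: __vadd4(__vadd4(x, x), 0xffffffff) == 2*b - 1 per byte.
        uint32_t via_add = emulate_vadd4(emulate_vadd4(word, word), 0xffffffffu);

        // Path used in fast_decoding.hpp: bytes equal to 0 become 0xFF (-1), bytes equal to 1 stay +1.
        uint32_t via_cmp = word | emulate_vcmpleu4(word, 0);

        assert(via_add == via_cmp);
        for (int i = 0; i < 4; ++i) {
            int8_t decoded = static_cast<int8_t>((via_add >> (8 * i)) & 0xff);
            assert(decoded == 2 * static_cast<int>((bits >> i) & 1u) - 1);  // {0,1} -> {-1,+1}
        }
    }
    puts("1-bit signed decode: 2*b - 1 mapping verified for all byte patterns");
    return 0;
}

Building this with any C++11 compiler (for example, g++ -std=c++11 check_decode.cpp, assuming that hypothetical file name) prints the confirmation line; it is only a sanity check of the arithmetic, not a test of the generated CUDA kernels.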