diff --git a/benchmark/dsl/convolution.py b/benchmark/dsl/convolution.py
index 3d9b5ac8..bf02a41d 100644
--- a/benchmark/dsl/convolution.py
+++ b/benchmark/dsl/convolution.py
@@ -103,7 +103,7 @@ def conv2d_nhwc_hwio(n, f, h, w, c, kh, kw, s, d, p, in_dtype="float16", out_dty
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
diff --git a/benchmark/dsl/matmul.py b/benchmark/dsl/matmul.py
index 85068b1e..85b9374e 100644
--- a/benchmark/dsl/matmul.py
+++ b/benchmark/dsl/matmul.py
@@ -17,7 +17,7 @@

 # fmt:off
 test_shapes = [
-    # (prim_func, input_args, default_dlight_schedule),
+    # (prim_func, input_args, default_bitblas_schedule),
     (matmul_nt, (1024, 1024, 1024, "float16", "float16"), Matmul),
     (matmul_nt, (16, 8192, 8192, "float16", "float16"), Matmul),
     (matmul_nt, (32, 8192, 8192, "float16", "float16"), Matmul),
@@ -113,15 +113,15 @@
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency * 1e3,
-            "fast_dlight_top20_latency": best.latency * 1e3,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3,
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency * 1e3,
+            "fast_bitblas_top20_latency": best.latency * 1e3,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3,
         }
     }

@@ -151,10 +151,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3f} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3f} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_af.py b/benchmark/dsl/matmul_dequantize_af.py
index e370de3c..5bc8362a 100644
--- a/benchmark/dsl/matmul_dequantize_af.py
+++ b/benchmark/dsl/matmul_dequantize_af.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -181,15 +181,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -219,10 +219,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_fp.py b/benchmark/dsl/matmul_dequantize_fp.py
index 66774aaf..102ba978 100644
--- a/benchmark/dsl/matmul_dequantize_fp.py
+++ b/benchmark/dsl/matmul_dequantize_fp.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -180,15 +180,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -218,10 +218,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_int1.py b/benchmark/dsl/matmul_dequantize_int1.py
index 1f8b1775..8874c37f 100644
--- a/benchmark/dsl/matmul_dequantize_int1.py
+++ b/benchmark/dsl/matmul_dequantize_int1.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,9 +11,10 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
-
+bitblas.set_log_level("DEBUG")

 # append a parser for the benchmark set
 parser = argparse.ArgumentParser(description="Benchmark BitBLAS int8xint1 on a specific target.")
@@ -50,7 +50,7 @@ llm_int8xint1 = [
     # square test
-    (matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "uint", False,
+    (matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "int", False,
      False, group_size, True, False), Matmul),
     # BLOOM-176B
     (matmul_nt_dequantize_b, (1, 43008, 14336, "int8", "int8", "int32", 1, "int8", "uint", False,
@@ -150,6 +150,8 @@
     tune_start = time.time()
     cpresults, best = apply_and_build(func, configs, arch, parallel_build=True)
     fast_tune_time = time.time() - tune_start
+    # print(best.sch.mod)
+    print(best.code)
     print("[BitBLAS] The best latency of top 1 is {:.3f} ms".format(cpresults[0].latency))
     print("[BitBLAS] The best latency of top 20 is {:.3f} ms".format(best.latency))

@@ -183,15 +185,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -221,10 +223,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/matmul_dequantize_int4.py b/benchmark/dsl/matmul_dequantize_int4.py
index 7eb6cf8d..f8c57755 100644
--- a/benchmark/dsl/matmul_dequantize_int4.py
+++ b/benchmark/dsl/matmul_dequantize_int4.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -12,6 +11,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_a_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -250,15 +250,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -288,10 +288,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/benchmark/dsl/weight_propagate.py b/benchmark/dsl/weight_propagate.py
index e69310af..aab5316c 100644
--- a/benchmark/dsl/weight_propagate.py
+++ b/benchmark/dsl/weight_propagate.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import tvm
 import bitblas
 from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
 from bitblas.base.roller.arch import CUDA
@@ -17,6 +16,7 @@
     matmul_nt_dequantize_b,
     matmul_nt_dequantize_b_propagate_b,
 )
+import tvm
 import time
 import argparse
@@ -512,15 +512,15 @@
     else:
         t = 1e4 - 1

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency,
-            "fast_dlight_top20_latency": best.latency,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3 if t is not None else "Failed",
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency,
+            "fast_bitblas_top20_latency": best.latency,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3 if t is not None else "Failed",
         }
     }

@@ -550,10 +550,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3e} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3e} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/python/bitblas/base/roller/bestfit.py b/python/bitblas/base/roller/bestfit.py
index b0e541c6..ad8ec20a 100644
--- a/python/bitblas/base/roller/bestfit.py
+++ b/python/bitblas/base/roller/bestfit.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-"""Benifit For Dlight Schedule"""
+"""Benefit For BitBLAS Schedule"""

 class Block:
     def __init__(self, start, end, is_free):
         self.start = start
diff --git a/python/bitblas/base/roller/policy/default.py b/python/bitblas/base/roller/policy/default.py
index 5526e131..81aeba12 100644
--- a/python/bitblas/base/roller/policy/default.py
+++ b/python/bitblas/base/roller/policy/default.py
@@ -20,7 +20,7 @@ class DefaultPolicy:
     """
     Default Policy for fastdlight, a heuristic plan that tries to
-    minimize memory traffic and maximize parallelism.for Dlight Schedule.
+    minimize memory traffic and maximize parallelism for BitBLAS Schedule.
""" def __init__(self, diff --git a/python/bitblas/gpu/intrin/lop3.py b/python/bitblas/gpu/intrin/lop3.py index bad04ceb..43c91a64 100644 --- a/python/bitblas/gpu/intrin/lop3.py +++ b/python/bitblas/gpu/intrin/lop3.py @@ -565,7 +565,9 @@ decode_i1s_to_i8s = """template __device__ void decode_i1s_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) { - int *i8s = reinterpret_cast(_i8s); + int i8s[4]; + // vector load + *reinterpret_cast(i8s) = *reinterpret_cast(_i8s); int16_t i1b_i16 = *reinterpret_cast(_i1b); // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15} // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x} @@ -577,16 +579,17 @@ static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1 static constexpr uint I8s_MAGIC_NUM = 0x00000000; - static constexpr uint MEDIAN_NUM = 0x00000000; - static constexpr uint TRANSFORM_SUBTRACT = 0x01010101; + static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1 for (int i = 0; i < N / 4; i++) { asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n" : "=r"(i8s[i]) : "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut)); - i8s[i] = __vsubss4(__vaddss4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT); + i8s[i] = __vadd4(i8s[i], i8s[i]); + i8s[i] = __vadd4(i8s[i], TRANSFORM_SUBTRACT); } + *reinterpret_cast(_i8s) = *reinterpret_cast(i8s); } template diff --git a/testing/cpp/lop3_type_conversion/fast_decoding.hpp b/testing/cpp/lop3_type_conversion/fast_decoding.hpp index ac24d40a..184dfa24 100644 --- a/testing/cpp/lop3_type_conversion/fast_decoding.hpp +++ b/testing/cpp/lop3_type_conversion/fast_decoding.hpp @@ -712,7 +712,9 @@ __device__ void decode_i2u_to_i8s(T1 *_i2u, T2 *B_local_decode, const int N = 16 template __device__ void decode_i1b_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) { - int *i8s = reinterpret_cast(_i8s); + int i8s[4]; + // vector load + *reinterpret_cast(i8s) = *reinterpret_cast(_i8s); int16_t i1b_i16 = *reinterpret_cast(_i1b); // permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15} // into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x} @@ -724,7 +726,7 @@ __device__ void decode_i1b_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010 static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1 static constexpr uint I8s_MAGIC_NUM = 0x00000000; - static constexpr uint TRANSFORM_SUBTRACT = 0x01010101; + static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1 for (int i = 0; i < N / 4; i++) { @@ -734,9 +736,15 @@ __device__ void decode_i1b_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16) if constexpr (isSigned) { - i8s[i] = __vsubss4(__vaddss4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT); + int _i8s = i8s[i]; + int tmp = __vcmpleu4(_i8s, 0); + _i8s |= tmp; + i8s[i] = _i8s; + // // i8s[i] = __vadd4(__vadd4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT); } } + // vector store + *reinterpret_cast(_i8s) = *reinterpret_cast(i8s); } template diff --git a/testing/python/dsl/test_auto_normalized_tensorcore.py b/testing/python/dsl/test_auto_normalized_tensorcore.py index c7c9a723..eb6e0bae 100644 --- a/testing/python/dsl/test_auto_normalized_tensorcore.py +++ b/testing/python/dsl/test_auto_normalized_tensorcore.py @@ -11,7 +11,7 @@ import time benchmark_sets = [ - # (prim_func, input_args, default_dlight_schedule), + # (prim_func, input_args, 
     (conv2d_nhwc_hwio, (128, 64, 224, 224, 3, 7, 7, 2, 1, 3, "float16", "float16"), Matmul),
     (conv2d_nhwc_ohwi, (128, 64, 56, 56, 64, 3, 3, 1, 1, 1, "float16", "float16"), Matmul),
     (conv2d_nhwc_hwio, (128, 64, 56, 56, 64, 1, 1, 1, 1, 1, "float16", "float16"), Matmul),
@@ -66,15 +66,15 @@
     timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
     t = timer_cuda_mod(*profile_tensors).mean

-    print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
+    print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

     profile_config = {
         f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
-            "fast_dlight_top20_tune_time": fast_tune_time,
-            "fast_dlight_top1_latency": cpresults[0].latency * 1e3,
-            "fast_dlight_top20_latency": best.latency * 1e3,
-            "default_dlight_tune_time": default_tune_time,
-            "default_dlight_latency": t * 1e3,
+            "fast_bitblas_top20_tune_time": fast_tune_time,
+            "fast_bitblas_top1_latency": cpresults[0].latency * 1e3,
+            "fast_bitblas_top20_latency": best.latency * 1e3,
+            "default_bitblas_tune_time": default_tune_time,
+            "default_bitblas_latency": t * 1e3,
         }
     }
     benchmark_results.update(profile_config)
@@ -103,10 +103,10 @@
     row = [
         func_name,
         input_args,
-        f" {str(values['fast_dlight_top20_tune_time'])} s",
-        f"{values['fast_dlight_top1_latency']:.3f} ms",
-        f"{values['fast_dlight_top20_latency']:.3f} ms",
-        str(values["default_dlight_tune_time"]),
-        f"{values['default_dlight_latency']:.3f} ms",
+        f" {str(values['fast_bitblas_top20_tune_time'])} s",
+        f"{values['fast_bitblas_top1_latency']:.3f} ms",
+        f"{values['fast_bitblas_top20_latency']:.3f} ms",
+        str(values["default_bitblas_tune_time"]),
+        f"{values['default_bitblas_latency']:.3f} ms",
     ]
     print("".join(word.ljust(col_width) for word in row))
diff --git a/testing/python/operators/test_ladder_permutate_ops.py b/testing/python/operators/test_ladder_permutate_ops.py
index e14b5c50..ed8586e0 100644
--- a/testing/python/operators/test_ladder_permutate_ops.py
+++ b/testing/python/operators/test_ladder_permutate_ops.py
@@ -1,9 +1,9 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.ops.ladder_permutate import LadderPermutate, LadderPermutateConfig
+import tvm

 target = tvm.target.Target("llvm")
diff --git a/testing/python/operators/test_lop3_permutate_ops.py b/testing/python/operators/test_lop3_permutate_ops.py
index 31017d91..55dde117 100644
--- a/testing/python/operators/test_lop3_permutate_ops.py
+++ b/testing/python/operators/test_lop3_permutate_ops.py
@@ -1,10 +1,10 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.ops.lop3_permutate import LOP3Permutate, LOP3PermutateConfig
+import tvm

 target = tvm.target.Target("llvm")

 # fmt: off
diff --git a/testing/python/operators/test_matmul_dequantize_ops.py b/testing/python/operators/test_matmul_dequantize_ops.py
index a4a48f26..dddafc98 100644
--- a/testing/python/operators/test_matmul_dequantize_ops.py
+++ b/testing/python/operators/test_matmul_dequantize_ops.py
@@ -1,13 +1,13 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.utils import auto_detect_nvidia_target
 from bitblas.ops.matmul_dequantize import (
     MatmulWeightOnlyDequantize,
     MatmulWeightOnlyDequantizeConfig,
 )
+import tvm
 import logging
 from bitblas import set_log_level
diff --git a/testing/python/operators/test_param_permutate_ops.py b/testing/python/operators/test_param_permutate_ops.py
index 4f8d32c8..9149c15e 100644
--- a/testing/python/operators/test_param_permutate_ops.py
+++ b/testing/python/operators/test_param_permutate_ops.py
@@ -1,10 +1,11 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 import pytest
-import tvm
 import bitblas
 from bitblas.ops.param_permutate import ParamPermutate, ParamPermutateConfig
+import tvm
+

 target = tvm.target.Target("llvm")
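
Note on the 1-bit signed decode change above (python/bitblas/gpu/intrin/lop3.py and testing/cpp/lop3_type_conversion/fast_decoding.hpp): after the lop3 step leaves one decoded bit b in {0, 1} in each output byte, both new code paths map it to the signed int8 value 2*b - 1, i.e. {0, 1} -> {-1, +1}. The lop3.py kernel does this with two byte-wise __vadd4 calls against TRANSFORM_SUBTRACT = 0xffffffff; the fast_decoding.hpp reference does it with a __vcmpleu4 compare-and-or. The standalone host-side sketch below is not part of the patch; it emulates the byte-wise CUDA intrinsics in plain C++ (the emulate_* helpers are illustrative stand-ins, not CUDA APIs) and checks that the two formulations agree on that mapping.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Host-side emulation of the per-byte SIMD intrinsics used in the patch.
static uint32_t emulate_vadd4(uint32_t a, uint32_t b) {
    uint32_t r = 0;
    for (int i = 0; i < 4; ++i) {
        uint32_t x = (a >> (8 * i)) & 0xff, y = (b >> (8 * i)) & 0xff;
        r |= ((x + y) & 0xff) << (8 * i);  // per-byte add with wrap-around
    }
    return r;
}

static uint32_t emulate_vcmpleu4(uint32_t a, uint32_t b) {
    uint32_t r = 0;
    for (int i = 0; i < 4; ++i) {
        uint32_t x = (a >> (8 * i)) & 0xff, y = (b >> (8 * i)) & 0xff;
        if (x <= y) r |= 0xffu << (8 * i);  // 0xFF wherever byte(a) <= byte(b)
    }
    return r;
}

int main() {
    // Each byte of 'word' holds one decoded bit (0 or 1), as produced by the lop3 step.
    for (uint32_t bits = 0; bits < 16; ++bits) {
        uint32_t word = 0;
        for (int i = 0; i < 4; ++i)
            word |= ((bits >> i) & 1u) << (8 * i);

        // Path used in lop3.py: __vadd4(__vadd4(x, x), 0xffffffff) == 2*b - 1 per byte.
        uint32_t via_add = emulate_vadd4(emulate_vadd4(word, word), 0xffffffffu);

        // Path used in fast_decoding.hpp: bytes equal to 0 become 0xFF (-1), bytes equal to 1 stay +1.
        uint32_t via_cmp = word | emulate_vcmpleu4(word, 0);

        assert(via_add == via_cmp);
        for (int i = 0; i < 4; ++i) {
            int8_t decoded = static_cast<int8_t>((via_add >> (8 * i)) & 0xff);
            assert(decoded == 2 * static_cast<int>((bits >> i) & 1u) - 1);  // {0,1} -> {-1,+1}
        }
    }
    puts("1-bit signed decode: 2*b - 1 mapping verified for all byte patterns");
    return 0;
}

Building this with any C++11 compiler (for example, g++ -std=c++11 check_decode.cpp, assuming that hypothetical file name) prints the confirmation line; it is only a sanity check of the arithmetic, not a test of the generated CUDA kernels.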