Skip to content

Commit

Permalink
[DEV] Remove extra dependencies and refactor the tvm import. (#16)
Browse files Browse the repository at this point in the history
* Update dependabot.yml

* remove dependency.
  • Loading branch information
LeiWang1999 authored Apr 16, 2024
1 parent 16e1f99 commit 5fad509
Show file tree
Hide file tree
Showing 16 changed files with 114 additions and 100 deletions.
2 changes: 1 addition & 1 deletion benchmark/dsl/convolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def conv2d_nhwc_hwio(n, f, h, w, c, kh, kw, s, d, p, in_dtype="float16", out_dty
timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
t = timer_cuda_mod(*profile_tensors).mean

print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

profile_config = {
f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
Expand Down
24 changes: 12 additions & 12 deletions benchmark/dsl/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# fmt:off
test_shapes = [
# (prim_func, input_args, default_dlight_schedule),
# (prim_func, input_args, default_bitblas_schedule),
(matmul_nt, (1024, 1024, 1024, "float16", "float16"), Matmul),
(matmul_nt, (16, 8192, 8192, "float16", "float16"), Matmul),
(matmul_nt, (32, 8192, 8192, "float16", "float16"), Matmul),
Expand Down Expand Up @@ -113,15 +113,15 @@
timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5)
t = timer_cuda_mod(*profile_tensors).mean

print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

profile_config = {
f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
"fast_dlight_top20_tune_time": fast_tune_time,
"fast_dlight_top1_latency": cpresults[0].latency * 1e3,
"fast_dlight_top20_latency": best.latency * 1e3,
"default_dlight_tune_time": default_tune_time,
"default_dlight_latency": t * 1e3,
"fast_bitblas_top20_tune_time": fast_tune_time,
"fast_bitblas_top1_latency": cpresults[0].latency * 1e3,
"fast_bitblas_top20_latency": best.latency * 1e3,
"default_bitblas_tune_time": default_tune_time,
"default_bitblas_latency": t * 1e3,
}
}

Expand Down Expand Up @@ -151,10 +151,10 @@
row = [
func_name,
input_args,
f" {str(values['fast_dlight_top20_tune_time'])} s",
f"{values['fast_dlight_top1_latency']:.3f} ms",
f"{values['fast_dlight_top20_latency']:.3f} ms",
str(values["default_dlight_tune_time"]),
f"{values['default_dlight_latency']:.3f} ms",
f" {str(values['fast_bitblas_top20_tune_time'])} s",
f"{values['fast_bitblas_top1_latency']:.3f} ms",
f"{values['fast_bitblas_top20_latency']:.3f} ms",
str(values["default_bitblas_tune_time"]),
f"{values['default_bitblas_latency']:.3f} ms",
]
print("".join(word.ljust(col_width) for word in row))
24 changes: 12 additions & 12 deletions benchmark/dsl/matmul_dequantize_af.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import tvm
import bitblas
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.roller.arch import CUDA
Expand All @@ -12,6 +11,7 @@
matmul_nt_dequantize_b,
matmul_nt_dequantize_b_propagate_a_propagate_b,
)
import tvm
import time
import argparse

Expand Down Expand Up @@ -181,15 +181,15 @@
else:
t = 1e4 - 1

print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

profile_config = {
f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
"fast_dlight_top20_tune_time": fast_tune_time,
"fast_dlight_top1_latency": cpresults[0].latency,
"fast_dlight_top20_latency": best.latency,
"default_dlight_tune_time": default_tune_time,
"default_dlight_latency": t * 1e3 if t is not None else "Failed",
"fast_bitblas_top20_tune_time": fast_tune_time,
"fast_bitblas_top1_latency": cpresults[0].latency,
"fast_bitblas_top20_latency": best.latency,
"default_bitblas_tune_time": default_tune_time,
"default_bitblas_latency": t * 1e3 if t is not None else "Failed",
}
}

Expand Down Expand Up @@ -219,10 +219,10 @@
row = [
func_name,
input_args,
f" {str(values['fast_dlight_top20_tune_time'])} s",
f"{values['fast_dlight_top1_latency']:.3f} ms",
f"{values['fast_dlight_top20_latency']:.3f} ms",
str(values["default_dlight_tune_time"]),
f"{values['default_dlight_latency']:.3e} ms",
f" {str(values['fast_bitblas_top20_tune_time'])} s",
f"{values['fast_bitblas_top1_latency']:.3f} ms",
f"{values['fast_bitblas_top20_latency']:.3f} ms",
str(values["default_bitblas_tune_time"]),
f"{values['default_bitblas_latency']:.3e} ms",
]
print("".join(word.ljust(col_width) for word in row))
24 changes: 12 additions & 12 deletions benchmark/dsl/matmul_dequantize_fp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import tvm
import bitblas
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.roller.arch import CUDA
Expand All @@ -12,6 +11,7 @@
matmul_nt_dequantize_b,
matmul_nt_dequantize_b_propagate_a_propagate_b,
)
import tvm
import time
import argparse

Expand Down Expand Up @@ -180,15 +180,15 @@
else:
t = 1e4 - 1

print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

profile_config = {
f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
"fast_dlight_top20_tune_time": fast_tune_time,
"fast_dlight_top1_latency": cpresults[0].latency,
"fast_dlight_top20_latency": best.latency,
"default_dlight_tune_time": default_tune_time,
"default_dlight_latency": t * 1e3 if t is not None else "Failed",
"fast_bitblas_top20_tune_time": fast_tune_time,
"fast_bitblas_top1_latency": cpresults[0].latency,
"fast_bitblas_top20_latency": best.latency,
"default_bitblas_tune_time": default_tune_time,
"default_bitblas_latency": t * 1e3 if t is not None else "Failed",
}
}

Expand Down Expand Up @@ -218,10 +218,10 @@
row = [
func_name,
input_args,
f" {str(values['fast_dlight_top20_tune_time'])} s",
f"{values['fast_dlight_top1_latency']:.3f} ms",
f"{values['fast_dlight_top20_latency']:.3f} ms",
str(values["default_dlight_tune_time"]),
f"{values['default_dlight_latency']:.3e} ms",
f" {str(values['fast_bitblas_top20_tune_time'])} s",
f"{values['fast_bitblas_top1_latency']:.3f} ms",
f"{values['fast_bitblas_top20_latency']:.3f} ms",
str(values["default_bitblas_tune_time"]),
f"{values['default_bitblas_latency']:.3e} ms",
]
print("".join(word.ljust(col_width) for word in row))
30 changes: 16 additions & 14 deletions benchmark/dsl/matmul_dequantize_int1.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import tvm
import bitblas
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.roller.arch import CUDA
Expand All @@ -12,9 +11,10 @@
matmul_nt_dequantize_b,
matmul_nt_dequantize_b_propagate_a_propagate_b,
)
import tvm
import time
import argparse

bitblas.set_log_level("DEBUG")
# append a parser for the benchmark set

parser = argparse.ArgumentParser(description="Benchmark BitBLAS int8xint1 on a specific target.")
Expand Down Expand Up @@ -50,7 +50,7 @@

llm_int8xint1 = [
# square test
(matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "uint", False,
(matmul_nt_dequantize_b, (1, 16384, 16384, "int8", "int8", "int32", 1, "int8", "int", False,
False, group_size, True, False), Matmul),
# BLOOM-176B
(matmul_nt_dequantize_b, (1, 43008, 14336, "int8", "int8", "int32", 1, "int8", "uint", False,
Expand Down Expand Up @@ -150,6 +150,8 @@
tune_start = time.time()
cpresults, best = apply_and_build(func, configs, arch, parallel_build=True)
fast_tune_time = time.time() - tune_start
# print(best.sch.mod)
print(best.code)
print("[BitBLAS] The best latency of top 1 is {:.3f} ms".format(cpresults[0].latency))
print("[BitBLAS] The best latency of top 20 is {:.3f} ms".format(best.latency))

Expand Down Expand Up @@ -183,15 +185,15 @@
else:
t = 1e4 - 1

print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

profile_config = {
f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
"fast_dlight_top20_tune_time": fast_tune_time,
"fast_dlight_top1_latency": cpresults[0].latency,
"fast_dlight_top20_latency": best.latency,
"default_dlight_tune_time": default_tune_time,
"default_dlight_latency": t * 1e3 if t is not None else "Failed",
"fast_bitblas_top20_tune_time": fast_tune_time,
"fast_bitblas_top1_latency": cpresults[0].latency,
"fast_bitblas_top20_latency": best.latency,
"default_bitblas_tune_time": default_tune_time,
"default_bitblas_latency": t * 1e3 if t is not None else "Failed",
}
}

Expand Down Expand Up @@ -221,10 +223,10 @@
row = [
func_name,
input_args,
f" {str(values['fast_dlight_top20_tune_time'])} s",
f"{values['fast_dlight_top1_latency']:.3f} ms",
f"{values['fast_dlight_top20_latency']:.3f} ms",
str(values["default_dlight_tune_time"]),
f"{values['default_dlight_latency']:.3e} ms",
f" {str(values['fast_bitblas_top20_tune_time'])} s",
f"{values['fast_bitblas_top1_latency']:.3f} ms",
f"{values['fast_bitblas_top20_latency']:.3f} ms",
str(values["default_bitblas_tune_time"]),
f"{values['default_bitblas_latency']:.3e} ms",
]
print("".join(word.ljust(col_width) for word in row))
24 changes: 12 additions & 12 deletions benchmark/dsl/matmul_dequantize_int4.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import tvm
import bitblas
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.roller.arch import CUDA
Expand All @@ -12,6 +11,7 @@
matmul_nt_dequantize_b,
matmul_nt_dequantize_b_propagate_a_propagate_b,
)
import tvm
import time
import argparse

Expand Down Expand Up @@ -250,15 +250,15 @@
else:
t = 1e4 - 1

print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

profile_config = {
f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
"fast_dlight_top20_tune_time": fast_tune_time,
"fast_dlight_top1_latency": cpresults[0].latency,
"fast_dlight_top20_latency": best.latency,
"default_dlight_tune_time": default_tune_time,
"default_dlight_latency": t * 1e3 if t is not None else "Failed",
"fast_bitblas_top20_tune_time": fast_tune_time,
"fast_bitblas_top1_latency": cpresults[0].latency,
"fast_bitblas_top20_latency": best.latency,
"default_bitblas_tune_time": default_tune_time,
"default_bitblas_latency": t * 1e3 if t is not None else "Failed",
}
}

Expand Down Expand Up @@ -288,10 +288,10 @@
row = [
func_name,
input_args,
f" {str(values['fast_dlight_top20_tune_time'])} s",
f"{values['fast_dlight_top1_latency']:.3f} ms",
f"{values['fast_dlight_top20_latency']:.3f} ms",
str(values["default_dlight_tune_time"]),
f"{values['default_dlight_latency']:.3e} ms",
f" {str(values['fast_bitblas_top20_tune_time'])} s",
f"{values['fast_bitblas_top1_latency']:.3f} ms",
f"{values['fast_bitblas_top20_latency']:.3f} ms",
str(values["default_bitblas_tune_time"]),
f"{values['default_bitblas_latency']:.3e} ms",
]
print("".join(word.ljust(col_width) for word in row))
24 changes: 12 additions & 12 deletions benchmark/dsl/weight_propagate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import tvm
import bitblas
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.roller.arch import CUDA
Expand All @@ -17,6 +16,7 @@
matmul_nt_dequantize_b,
matmul_nt_dequantize_b_propagate_b,
)
import tvm
import time
import argparse

Expand Down Expand Up @@ -512,15 +512,15 @@
else:
t = 1e4 - 1

print("Time cost of Dlight default schedule: {:.3f} ms".format(t * 1e3))
print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3))

profile_config = {
f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": {
"fast_dlight_top20_tune_time": fast_tune_time,
"fast_dlight_top1_latency": cpresults[0].latency,
"fast_dlight_top20_latency": best.latency,
"default_dlight_tune_time": default_tune_time,
"default_dlight_latency": t * 1e3 if t is not None else "Failed",
"fast_bitblas_top20_tune_time": fast_tune_time,
"fast_bitblas_top1_latency": cpresults[0].latency,
"fast_bitblas_top20_latency": best.latency,
"default_bitblas_tune_time": default_tune_time,
"default_bitblas_latency": t * 1e3 if t is not None else "Failed",
}
}

Expand Down Expand Up @@ -550,10 +550,10 @@
row = [
func_name,
input_args,
f" {str(values['fast_dlight_top20_tune_time'])} s",
f"{values['fast_dlight_top1_latency']:.3f} ms",
f"{values['fast_dlight_top20_latency']:.3f} ms",
str(values["default_dlight_tune_time"]),
f"{values['default_dlight_latency']:.3e} ms",
f" {str(values['fast_bitblas_top20_tune_time'])} s",
f"{values['fast_bitblas_top1_latency']:.3f} ms",
f"{values['fast_bitblas_top20_latency']:.3f} ms",
str(values["default_bitblas_tune_time"]),
f"{values['default_bitblas_latency']:.3e} ms",
]
print("".join(word.ljust(col_width) for word in row))
2 changes: 1 addition & 1 deletion python/bitblas/base/roller/bestfit.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Benifit For Dlight Schedule"""
"""Bestfit For BitBLAS Schedule"""
class Block:
def __init__(self, start, end, is_free):
self.start = start
Expand Down
2 changes: 1 addition & 1 deletion python/bitblas/base/roller/policy/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
class DefaultPolicy:
"""
Default Policy for fastdlight, a heuristic plan that tries to
minimize memory traffic and maximize parallelism.for Dlight Schedule.
minimize memory traffic and maximize parallelism for BitBLAS Schedule.
"""

def __init__(self,
Expand Down
11 changes: 7 additions & 4 deletions python/bitblas/gpu/intrin/lop3.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,9 @@
decode_i1s_to_i8s = """template <typename T1, typename T2>
__device__ void decode_i1s_to_i8s(T1 *_i1b, T2 *_i8s, const int N = 16)
{
int *i8s = reinterpret_cast<int *>(_i8s);
int i8s[4];
// vector load
*reinterpret_cast<int4 *>(i8s) = *reinterpret_cast<int4 *>(_i8s);
int16_t i1b_i16 = *reinterpret_cast<int16_t *>(_i1b);
// permutate: {e0,e4,e8,e12,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15}
// into: {e0,e4,e8,e12,x,x,x,x,e1,e5,e9,x,x,x,x,e13,e2,e6,e10,e14,e1,e5,e9,e13,e3,e7,e11,e15,x,x,x,x}
Expand All @@ -577,16 +579,17 @@
static constexpr uint immLut = (0xf0 & 0xcc) | 0xaa; // 0b11101010
static constexpr uint BOTTOM_MASK = 0x01010101; // 0x1 -> 0b01 select 0,1
static constexpr uint I8s_MAGIC_NUM = 0x00000000;
static constexpr uint MEDIAN_NUM = 0x00000000;
static constexpr uint TRANSFORM_SUBTRACT = 0x01010101;
static constexpr uint TRANSFORM_SUBTRACT = 0xffffffff; // for signed int 2x - 1
for (int i = 0; i < N / 4; i++)
{
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\\n"
: "=r"(i8s[i])
: "r"(i1b >> i), "n"(BOTTOM_MASK), "n"(I8s_MAGIC_NUM), "n"(immLut));
i8s[i] = __vsubss4(__vaddss4(i8s[i], i8s[i]), TRANSFORM_SUBTRACT);
i8s[i] = __vadd4(i8s[i], i8s[i]);
i8s[i] = __vadd4(i8s[i], TRANSFORM_SUBTRACT);
}
*reinterpret_cast<int4 *>(_i8s) = *reinterpret_cast<int4 *>(i8s);
}
template <typename T1, typename T2>
Expand Down
Loading

0 comments on commit 5fad509

Please sign in to comment.