[Dev] Refactor testing scripts and fix security issues (#72)
* chore: Update support matrix in README

* Move bitblas package to root

* Remove unused code files

* Create soft link for tvm

* Create soft link for tvm

* Update softlink paths for tvm in setup.py

* Refactor import statements to use relative paths

* fix test linear

* Move bitblas package to root

* Move bitblas package to root

* refactor splitk test

* Fix assert statement in ladder_permutate_impl.py

* Refactor test_ladder_permutate_ops.py for improved readability and maintainability

* Refactor test_ladder_permutate_ops.py for improved readability and maintainability

* improve and evaluate the test scripts.

* resolve security issue.
LeiWang1999 committed Jul 4, 2024
1 parent f4e15a5 commit 8804d77
Showing 16 changed files with 56 additions and 1,649 deletions.
1 change: 1 addition & 0 deletions bitblas/ops/impl/ladder_permutate_impl.py
@@ -49,6 +49,7 @@ def select_implementation(
     inp = te.placeholder((M, N // scaling_factor), name="inp", dtype=storage_dtype)
     args = [inp]

+    assert transform_kind != 0, "Permute only apply when transform_kind >= 1"
     if transform_kind >= 1:
         arg = args[-1]

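The added assert makes the transform_kind >= 1 precondition explicit (the ladder-permutate test file later in this commit drops its transform_kind == 0 cases to match). A minimal sketch of the guard, using a hypothetical wrapper rather than the real select_implementation signature:

def check_transform_kind(transform_kind: int) -> None:
    # transform_kind == 0 is now rejected up front instead of silently falling
    # through the "if transform_kind >= 1" branch.
    assert transform_kind != 0, "Permute only apply when transform_kind >= 1"

check_transform_kind(2)    # passes
# check_transform_kind(0)  # would raise AssertionError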
3 changes: 1 addition & 2 deletions bitblas/ops/operator.py
@@ -106,8 +106,7 @@ def tvm_callback_cuda_postproc(code, _):
                     **self.pass_context
             }):
                 rt_mod = tvm.build(self.optimized_func, target=target, name=self.name)
-        except Exception as e:
-            rt_build_error = e  # noqa
+        except Exception:  # noqa: F841
             logger.debug(
                 "Failed to build optimized function for CUDA target with default schedule, Please consider enable hardware aware tuning!"
             )
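This is the flake8 F841 fix ("local variable is assigned to but never used"): the exception is no longer bound to a name that nothing reads. A small sketch of the pattern with made-up names, not the BitBLAS code:

import logging

logger = logging.getLogger(__name__)

def build_or_fallback(build_fn):
    try:
        return build_fn()
    except Exception:
        # The exception object is intentionally not bound; binding it with
        # "except Exception as e" while never using "e" is what triggers F841.
        logger.debug("Build failed; falling back to the default schedule.")
        return None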
2 changes: 1 addition & 1 deletion integration/BitNet/modeling_bitnet.py
@@ -54,7 +54,7 @@

 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa: F401


 logger = logging.get_logger(__name__)
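The only change here is narrowing the lint suppression from a bare # noqa to # noqa: F401. A small illustration of the difference, using a hypothetical module rather than the BitNet code:

# A bare "# noqa" silences every checker on the line, while "# noqa: F401"
# only allows the deliberately unused import and keeps other rules active.
from typing import Optional  # noqa: F401  (kept for re-export)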
55 changes: 25 additions & 30 deletions testing/python/operators/test_general_matmul_splitk_ops.py
@@ -11,16 +11,7 @@ def get_codegen_result(ops):


 # fmt: off
-@pytest.mark.parametrize(
-    "M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode",
-    [
-        (1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
-         None),
-        (16, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
-         None),
-    ],
-)
-def test_matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout,
+def matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout,
                            with_bias, group_size, with_scaling, with_zeros, zeros_mode):

     matmul_config = MatmulConfigWithSplitK(
@@ -37,21 +28,21 @@ def test_matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtyp
         with_scaling=with_scaling,
         with_zeros=with_zeros,
         zeros_mode=zeros_mode,
+        propagate_a=False,
+        propagate_b=False,
     )
     matmul = MatmulWithSplitK(config=matmul_config, enable_tuning=False)
     assert get_codegen_result(matmul)


-@pytest.mark.parametrize(
-    "SPlitK,M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode",
-    [
-        (1, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
-         False, None),
-        (4, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
-         False, None),
-    ],
-)
-def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
+def test_matmul_codegen_default():
+    matmul_codegen_default(1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
+                           None)
+    matmul_codegen_default(16, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False, False,
+                           None)
+
+
+def matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
                                     layout, with_bias, group_size, with_scaling, with_zeros,
                                     zeros_mode):
     import torch
@@ -71,6 +62,8 @@ def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accu
         with_scaling=with_scaling,
         with_zeros=with_zeros,
         zeros_mode=zeros_mode,
+        propagate_a=False,
+        propagate_b=False,
     )
     matmul = MatmulWithSplitK(config=matmul_config, enable_tuning=False)

@@ -84,17 +77,13 @@ def test_matmul_torch_forward_consistent(SplitK, M, N, K, A_dtype, W_dtype, accu
     output_torch = torch.matmul(inputs[0], inputs[1].t() if layout == "nt" else inputs[1])
     torch.testing.assert_close(output_bitblas, output_torch, rtol=1e-2, atol=1e-1)

+def test_matmul_torch_forward_consistent():
+    matmul_torch_forward_consistent(1, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
+                                    False, None)
+    matmul_torch_forward_consistent(4, 1, 4096, 12800, "float16", "float16", "float16", "float16", "nt", False, -1, False,
+                                    False, None)

-@pytest.mark.parametrize(
-    "SPlitK,M,N,K,A_dtype,W_dtype,accum_dtype,out_dtype,layout,with_bias,group_size,with_scaling,with_zeros,zeros_mode",
-    [
-        (1, 16, 4096, 12800, "float16", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
-         False, None),
-        (4, 16, 4096, 12800, "float16", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
-         False, None),
-    ],
-)
-def test_matmul_torch_forward_fp8e4m3(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
+def matmul_torch_forward_fp8e4m3(SplitK, M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype,
                                  layout, with_bias, group_size, with_scaling, with_zeros,
                                  zeros_mode):
     import torch
@@ -157,6 +146,12 @@ def map_torch_type(intype):

     torch.testing.assert_close(bitblas_out, ref_out, rtol=1e0, atol=1e-1)

+@bitblas.testing.requires_cuda_compute_version(8, 9)
+def test_matmul_torch_forward_fp8e4m3():
+    matmul_torch_forward_fp8e4m3(1, 16, 4096, 12800, "e4m3_float8", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
+                                 False, None)
+    matmul_torch_forward_fp8e4m3(4, 16, 4096, 12800, "e4m3_float8", "e4m3_float8", "float32", "float16", "nt", False, -1, False,
+                                 False, None)

 # fmt: on
 if __name__ == "__main__":
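This file shows the refactor pattern applied across the testing scripts: each @pytest.mark.parametrize decorator is replaced by a plain helper function plus an explicit test_* wrapper that enumerates the same cases. A minimal sketch with generic names (not the actual BitBLAS tests):

import pytest

def run_matmul_case(m, n, k, dtype):
    # Stand-in for the real helper; a real test would build and check a kernel.
    assert m > 0 and n > 0 and k > 0 and dtype

# Before: the cases live in a decorator and pytest generates one test per tuple.
@pytest.mark.parametrize("m,n,k,dtype", [(1, 4096, 12800, "float16"),
                                         (16, 4096, 12800, "float16")])
def test_matmul_case_parametrized(m, n, k, dtype):
    run_matmul_case(m, n, k, dtype)

# After: one test function enumerates the same cases explicitly, as this commit does.
def test_matmul_case():
    run_matmul_case(1, 4096, 12800, "float16")
    run_matmul_case(16, 4096, 12800, "float16")

Collapsing the parametrization gives up per-case reporting in pytest, but the tests become plain functions that are easy to call outside the pytest collector (for example from the if __name__ == "__main__": entry point above). The new FP8 wrapper is additionally gated with bitblas.testing.requires_cuda_compute_version(8, 9), presumably because e4m3 FP8 kernels need compute capability 8.9 or newer.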
32 changes: 12 additions & 20 deletions testing/python/operators/test_ladder_permutate_ops.py
@@ -9,16 +9,7 @@


 # fmt: off
-@pytest.mark.parametrize(
-    "M,N,datatype,dequantize_bits,storage_dtype,propagate_kind,transpose_matrix,transform_kind,target_instruction",
-    [
-        (1024, 1024, "float16", -1, "float16", "B", True, 0, "nvidia-mma"),
-        (1024, 1024, "float16", -1, "float16", "B", True, 1, "nvidia-mma"),
-        (1024, 1024, "float16", -1, "float16", "B", True, 2, "nvidia-mma"),
-        # dequantize propagation
-        (1024, 1024, "float16", 4, "uint32", "B", True, 2, "nvidia-mma"),
-    ])
-def test_ladder_permutate_profile_latency(
+def ladder_permutate_profile_latency(
     M,
     N,
     datatype,
@@ -49,16 +40,13 @@ def test_ladder_permutate_profile_latency(
     assert latency


-@pytest.mark.parametrize(
-    "M,N,datatype,dequantize_bits,storage_dtype,propagate_kind,transpose_matrix,transform_kind,target_instruction",
-    [
-        (1024, 1024, "float16", -1, "float16", "A", True, 0, "nvidia-mma"),
-        (1024, 1024, "float16", -1, "float16", "A", True, 1, "nvidia-mma"),
-        (1024, 1024, "float16", -1, "float16", "A", True, 2, "nvidia-mma"),
-        # dequantize propagation
-        (1024, 1024, "float16", 4, "uint32", "A", True, 2, "nvidia-mma"),
-    ])
-def test_ladder_permutate_profile_latency_cuda(
+def test_ladder_permutate_profile_latency():
+    ladder_permutate_profile_latency(1024, 1024, "float16", -1, "float16", "B", True, 1, "nvidia-mma")
+    ladder_permutate_profile_latency(1024, 1024, "float16", -1, "float16", "B", True, 2, "nvidia-mma")
+    ladder_permutate_profile_latency(1024, 1024, "float16", 4, "uint32", "B", True, 2, "nvidia-mma")
+
+
+def ladder_permutate_profile_latency_cuda(
     M,
     N,
     datatype,
@@ -91,6 +79,10 @@ def test_ladder_permutate_profile_latency_cuda(
     assert latency


+def test_ladder_permutate_profile_latency_cuda():
+    ladder_permutate_profile_latency_cuda(1024, 1024, "float16", -1, "float16", "A", True, 1, "nvidia-mma")
+    ladder_permutate_profile_latency_cuda(1024, 1024, "float16", -1, "float16", "A", True, 2, "nvidia-mma")
+    ladder_permutate_profile_latency_cuda(1024, 1024, "float16", 4, "uint32", "A", True, 2, "nvidia-mma")
 # fmt: on

 if __name__ == "__main__":