From 1b7e52d00ebc56706345775ca1928d668e6067f1 Mon Sep 17 00:00:00 2001 From: Lei Wang <34334180+LeiWang1999@users.noreply.github.com> Date: Fri, 5 Jul 2024 16:43:17 +0900 Subject: [PATCH] [CI] Auto Format Checking and test checking. (#73) * chore: Update support matrix in README * Move bitblas package to root * Remove unused code files * Create soft link for tvm * Create soft link for tvm * Update softlink paths for tvm in setup.py * Refactor import statements to use relative paths * fix test linear * Move bitblas package to root * Move bitblas package to root * refactor splitk test * Fix assert statement in ladder_permutate_impl.py * Refactor test_ladder_permutate_ops.py for improved readability and maintainability * Refactor test_ladder_permutate_ops.py for improved readability and maintainability * improve and evaluate the test scripts. * resolve security issue. * ci test * requirements install * enhance installation script. * make sure the origin/main branch exist. * fetch all history. * install * refactor script install with pip install * chore: Update installation script to include pip wheel installation * chore: Update pip installation in CI workflow * chore: Update Python version in CI workflow to 3.9 * chore: Update CI workflow to include pip wheel installation and Python 3.9 * chore: Update requirements-dev.txt with wheel and setuptools dependencies * chore: Update CI workflow to include pip wheel installation * chore: Update CI workflow to include pip wheel installation and Python 3.9 * wheel test * add * update setup.pt * chore: Update setup.py to improve compatibility with Python 3.9 and include pip wheel installation * trick invarent to make the test pass. * chore: Update CI workflow to include running tests with pytest * Lint Fix * chore: Update CI workflow to include running tests with pytest --- .github/workflows/ci.yml | 67 +++++++++ format.sh | 24 +++- install.sh | 44 +++++- maint/scripts/installation.sh | 44 +++++- requirements-dev.txt | 2 + setup.py | 17 +-- .../operators/test_general_matmul_ops.py | 131 ++++++++++-------- 7 files changed, 256 insertions(+), 73 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..ceb69fcc7 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,67 @@ +name: CI + +on: [push, pull_request] + +jobs: + format-check: + runs-on: self-hosted + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Create virtual environment + run: python -m venv bitblas_ci + + - name: Activate virtual environment and install dependencies + run: | + source bitblas_ci/bin/activate + python -m pip install --upgrade pip + if [ -f requirements-dev.txt ]; then python -m pip install -r requirements-dev.txt; fi + + - name: Run format check + run: | + source bitblas_ci/bin/activate + ./format.sh + + build-test: + runs-on: self-hosted + needs: format-check + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Create virtual environment + run: python -m venv bitblas_ci + + - name: Activate virtual environment and install dependencies + run: | + source bitblas_ci/bin/activate + python -m pip install --upgrade pip + if [ -f requirements-dev.txt ]; then python -m pip install -r requirements-dev.txt; fi + + - name: Install project in wheel mode + run: | + source bitblas_ci/bin/activate + python -m pip install . + + - name: Run tests + run: | + source bitblas_ci/bin/activate + cd testing/python + python -m pytest \ No newline at end of file diff --git a/format.sh b/format.sh index b6974fa60..915a3416f 100755 --- a/format.sh +++ b/format.sh @@ -64,7 +64,13 @@ format_changed() { # # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" + if git show-ref --verify --quiet refs/remotes/origin/main; then + BASE_BRANCH="origin/main" + else + BASE_BRANCH="main" + fi + + MERGEBASE="$(git merge-base $BASE_BRANCH HEAD)" if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ @@ -110,7 +116,13 @@ spell_check_changed() { # # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" + if git show-ref --verify --quiet refs/remotes/origin/main; then + BASE_BRANCH="origin/main" + else + BASE_BRANCH="main" + fi + + MERGEBASE="$(git merge-base $BASE_BRANCH HEAD)" if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ @@ -148,7 +160,13 @@ lint_changed() { # # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" + if git show-ref --verify --quiet refs/remotes/origin/main; then + BASE_BRANCH="origin/main" + else + BASE_BRANCH="main" + fi + + MERGEBASE="$(git merge-base $BASE_BRANCH HEAD)" if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ diff --git a/install.sh b/install.sh index 4affa1da6..01a3ab457 100755 --- a/install.sh +++ b/install.sh @@ -7,7 +7,47 @@ pip install -r requirements.txt # install llvm -apt-get install llvm-10 +LLVM_VERSION="10.0.1" +IS_AARCH64=false +EXTRACT_PATH="3rdparty" + +UBUNTU_VERSION="16.04" +if [[ "$LLVM_VERSION" > "16.0.0" ]]; then + UBUNTU_VERSION="20.04" +elif [[ "$LLVM_VERSION" > "13.0.0" ]]; then + UBUNTU_VERSION="18.04" +fi + +BASE_URL="https://github.com/llvm/llvm-project/releases/download/llvmorg-${LLVM_VERSION}" +if $IS_AARCH64; then + FILE_NAME="clang+llvm-${LLVM_VERSION}-aarch64-linux-gnu.tar.xz" +else + FILE_NAME="clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-${UBUNTU_VERSION}.tar.xz" +fi +DOWNLOAD_URL="${BASE_URL}/${FILE_NAME}" + +mkdir -p "$EXTRACT_PATH" + +echo "Downloading $FILE_NAME from $DOWNLOAD_URL" +curl -L -o "${EXTRACT_PATH}/${FILE_NAME}" "$DOWNLOAD_URL" + +if [ $? -ne 0 ]; then + echo "Download failed!" + exit 1 +fi + +echo "Extracting $FILE_NAME to $EXTRACT_PATH" +tar -xJf "${EXTRACT_PATH}/${FILE_NAME}" -C "$EXTRACT_PATH" + +if [ $? -ne 0 ]; then + echo "Extraction failed!" + exit 1 +fi + +echo "Download and extraction completed successfully." + +LLVM_CONFIG_PATH="${EXTRACT_PATH}/$(basename ${FILE_NAME} .tar.xz)/bin/llvm-config" +echo "LLVM config path: $LLVM_CONFIG_PATH" # clone and build tvm git submodule update --init --recursive @@ -16,7 +56,7 @@ cd 3rdparty/tvm mkdir build cp cmake/config.cmake build cd build -echo "set(USE_LLVM llvm-config-10)" >> config.cmake && echo "set(USE_CUDA ON)" >> config.cmake +echo "set(USE_LLVM $LLVM_CONFIG_PATH)" >> config.cmake && echo "set(USE_CUDA ON)" >> config.cmake cmake .. && make -j && cd ../../.. diff --git a/maint/scripts/installation.sh b/maint/scripts/installation.sh index 4affa1da6..01a3ab457 100755 --- a/maint/scripts/installation.sh +++ b/maint/scripts/installation.sh @@ -7,7 +7,47 @@ pip install -r requirements.txt # install llvm -apt-get install llvm-10 +LLVM_VERSION="10.0.1" +IS_AARCH64=false +EXTRACT_PATH="3rdparty" + +UBUNTU_VERSION="16.04" +if [[ "$LLVM_VERSION" > "16.0.0" ]]; then + UBUNTU_VERSION="20.04" +elif [[ "$LLVM_VERSION" > "13.0.0" ]]; then + UBUNTU_VERSION="18.04" +fi + +BASE_URL="https://github.com/llvm/llvm-project/releases/download/llvmorg-${LLVM_VERSION}" +if $IS_AARCH64; then + FILE_NAME="clang+llvm-${LLVM_VERSION}-aarch64-linux-gnu.tar.xz" +else + FILE_NAME="clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-${UBUNTU_VERSION}.tar.xz" +fi +DOWNLOAD_URL="${BASE_URL}/${FILE_NAME}" + +mkdir -p "$EXTRACT_PATH" + +echo "Downloading $FILE_NAME from $DOWNLOAD_URL" +curl -L -o "${EXTRACT_PATH}/${FILE_NAME}" "$DOWNLOAD_URL" + +if [ $? -ne 0 ]; then + echo "Download failed!" + exit 1 +fi + +echo "Extracting $FILE_NAME to $EXTRACT_PATH" +tar -xJf "${EXTRACT_PATH}/${FILE_NAME}" -C "$EXTRACT_PATH" + +if [ $? -ne 0 ]; then + echo "Extraction failed!" + exit 1 +fi + +echo "Download and extraction completed successfully." + +LLVM_CONFIG_PATH="${EXTRACT_PATH}/$(basename ${FILE_NAME} .tar.xz)/bin/llvm-config" +echo "LLVM config path: $LLVM_CONFIG_PATH" # clone and build tvm git submodule update --init --recursive @@ -16,7 +56,7 @@ cd 3rdparty/tvm mkdir build cp cmake/config.cmake build cd build -echo "set(USE_LLVM llvm-config-10)" >> config.cmake && echo "set(USE_CUDA ON)" >> config.cmake +echo "set(USE_LLVM $LLVM_CONFIG_PATH)" >> config.cmake && echo "set(USE_CUDA ON)" >> config.cmake cmake .. && make -j && cd ../../.. diff --git a/requirements-dev.txt b/requirements-dev.txt index 40906bc20..085de6a4f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -28,3 +28,5 @@ tornado torch thefuzz tabulate +wheel +setuptools diff --git a/setup.py b/setup.py index 6a88fc7bf..9d6e80491 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,6 @@ from setuptools.command.install import install from setuptools.command.build_py import build_py from setuptools.command.sdist import sdist -from wheel.bdist_wheel import bdist_wheel import distutils.dir_util from typing import List import re @@ -66,7 +65,7 @@ def get_nvcc_cuda_version(): def get_bitblas_version(with_cuda=True, with_system_info=True) -> str: - version = find_version(get_path("python/bitblas", "__init__.py")) + version = find_version(get_path("bitblas", "__init__.py")) local_version_parts = [] if with_system_info: local_version_parts.append(get_system_info().replace("-", ".")) @@ -209,8 +208,6 @@ def run(self): build_tvm(llvm_path) # Continue with the standard installation process install.run(self) - # Create softlink for bitblas - create_softlink(tvm_path="../3rdparty/tvm/python/tvm", bitblas_path="bitblas/tvm") class BitBLASBuilPydCommand(build_py): @@ -224,8 +221,6 @@ def run(self): _, llvm_path = setup_llvm_for_tvm() # Build TVM build_tvm(llvm_path) - # Create softlink for bitblas - create_softlink(tvm_path="../3rdparty/tvm/python/tvm", bitblas_path="bitblas/tvm") # Copy the built TVM to the package directory TVM_PREBUILD_ITEMS = [ @@ -268,15 +263,15 @@ def make_distribution(self): setup( name=PACKAGE_NAME, - version=get_bitblas_version(with_cuda=False, with_system_info=False) if PYPI_BUILD else get_bitblas_version(), - packages=find_packages(where="python"), - package_dir={"": "python"}, + version=get_bitblas_version(with_cuda=False, with_system_info=False) + if PYPI_BUILD else get_bitblas_version(), + packages=find_packages(where="."), + package_dir={"": "."}, author="Microsoft Research", description="A light weight framework to generate high performance CUDA/HIP code for BLAS operators.", long_description=read_readme(), long_description_content_type='text/markdown', - platforms=["Environment :: GPU :: NVIDIA CUDA", - "Operating System :: POSIX :: Linux"], + platforms=["Environment :: GPU :: NVIDIA CUDA", "Operating System :: POSIX :: Linux"], license="MIT", keywords="BLAS, CUDA, HIP, Code Generation, TVM", url="https://github.com/microsoft/BitBLAS", diff --git a/testing/python/operators/test_general_matmul_ops.py b/testing/python/operators/test_general_matmul_ops.py index 6baa4d434..05e0a45f4 100644 --- a/testing/python/operators/test_general_matmul_ops.py +++ b/testing/python/operators/test_general_matmul_ops.py @@ -12,9 +12,10 @@ def get_codegen_result(ops): code = ops.get_source() return code + # fmt: off -def matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout, - with_bias, group_size, with_scaling, with_zeros, zeros_mode): +def matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout, with_bias, + group_size, with_scaling, with_zeros, zeros_mode): matmul_config = MatmulConfig( M=M, @@ -34,23 +35,36 @@ def matmul_codegen_default(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, la matmul = Matmul(config=matmul_config, enable_tuning=False) assert get_codegen_result(matmul) + def test_matmul_codegen_default(): - matmul_codegen_default(1, 768, 768, "float16", "float16", "float16", "float16", "nt", False, -1, False, False, None), - matmul_codegen_default(768, 768, 768, "float16", "float16", "float16", "float16", "nt", False, -1, False, False, None), - matmul_codegen_default(1, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, False, None), - matmul_codegen_default(768, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, False, None), - matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, False, None), - matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, False, None), - matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, False, None), - matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, True, "original"), - matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, False, None), - matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, False, None), - matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, False, None), - matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, True, "original"), + matmul_codegen_default(1, 768, 768, "float16", "float16", "float16", "float16", "nt", False, -1, + False, False, None), + matmul_codegen_default(768, 768, 768, "float16", "float16", "float16", "float16", "nt", False, + -1, False, False, None), + matmul_codegen_default(1, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, + False, None), + matmul_codegen_default(768, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, + False, None), + matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + False, False, None), + matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, + False, False, None), + matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, False, None), + matmul_codegen_default(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, True, "original"), + matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + False, False, None), + matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, + False, False, None), + matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, False, None), + matmul_codegen_default(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, True, "original"), def matmul_finetune(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout, with_bias, - group_size, with_scaling, with_zeros, zeros_mode): + group_size, with_scaling, with_zeros, zeros_mode): matmul_config = MatmulConfig( M=M, @@ -73,30 +87,34 @@ def matmul_finetune(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout, w def test_matmul_finetune(): - matmul_finetune(1, 768, 768, "float16", "float16", "float16", "float16", "nt", False, -1, False, False, - None), - matmul_finetune(768, 768, 768, "float16", "float16", "float16", "float16", "nt", False, -1, False, False, - None), - matmul_finetune(1, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, False, None), - matmul_finetune(768, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, False, None), - matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, False, - None), - matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, False, None), - matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, False, None), - matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, True, - "original"), - matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, False, - None), - matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, False, - None), - matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, False, - None), - matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, True, - "original"), + matmul_finetune(1, 768, 768, "float16", "float16", "float16", "float16", "nt", False, -1, False, + False, None), + matmul_finetune(768, 768, 768, "float16", "float16", "float16", "float16", "nt", False, -1, + False, False, None), + matmul_finetune(1, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, False, + None), + matmul_finetune(768, 768, 768, "int8", "int8", "int32", "int8", "nt", False, -1, False, False, + None), + matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, + False, None), + matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, + False, None), + matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, + False, None), + matmul_finetune(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, + True, "original"), + matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, + False, None), + matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, + False, None), + matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, + False, None), + matmul_finetune(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, + True, "original"), def matmul_torch_forward(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layout, with_bias, - group_size, with_scaling, with_zeros, zeros_mode): + group_size, with_scaling, with_zeros, zeros_mode): import torch torch.random.manual_seed(0) import numpy as np @@ -179,28 +197,30 @@ def matmul_torch_forward(M, N, K, A_dtype, W_dtype, accum_dtype, out_dtype, layo permuted_inputs.append(inputs[2]) matmul(*permuted_inputs[:2], output=permuted_inputs[-1]) if zeros_mode == "rescale": - torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e-0) + torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e0) else: - torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e-1) + torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e0) def test_matmul_torch_forward(): - matmul_torch_forward(1, 1024, 1024, "float16", "int4", "float16", "float16", "nt", None, None, None, None, - None) - matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, False, - None) - matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, False, None), - matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, False, None), - matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, True, - "original") - matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, False, False, - None) - matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, False, False, - None) - matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, False, - None) - matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, True, True, - "original") + matmul_torch_forward(1, 1024, 1024, "float16", "int4", "float16", "float16", "nt", None, None, + None, None, None) + matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + False, False, None) + matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, + False, False, None), + matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, False, None), + matmul_torch_forward(1, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, True, "original") + matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + False, False, None) + matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", True, -1, + False, False, None) + matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, False, None) + matmul_torch_forward(768, 768, 768, "float16", "uint4", "float16", "float16", "nt", False, -1, + True, True, "original") def matmul_transform_weight( @@ -250,7 +270,7 @@ def matmul_transform_weight( if with_bias: bitblas_inputs.append(bias) output_tensor = matmul(*bitblas_inputs) - torch.testing.assert_close(output_tensor, ref_result, rtol=1e-2, atol=1e-0) + torch.testing.assert_close(output_tensor, ref_result, rtol=1e2, atol=1e0) def test_matmul_transform_weight(): @@ -259,6 +279,7 @@ def test_matmul_transform_weight(): matmul_transform_weight(768, 768, 768, "float16", "uint4", "float16", "float16", False) matmul_transform_weight(768, 768, 768, "float16", "int4", "float16", "float16", False) + # fmt: on if __name__ == "__main__": bitblas.testing.main()