diff --git a/python/bitblas/__init__.py b/bitblas/__init__.py similarity index 97% rename from python/bitblas/__init__.py rename to bitblas/__init__.py index 14b510845..172c4cbf1 100644 --- a/python/bitblas/__init__.py +++ b/bitblas/__init__.py @@ -11,7 +11,7 @@ sys.path.insert(0, install_tvm_path) develop_tvm_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "..", "..", "3rdparty", "tvm", "python") + os.path.dirname(os.path.abspath(__file__)), "..", "3rdparty", "tvm", "python") if os.path.exists(develop_tvm_path) and develop_tvm_path not in sys.path: os.environ["PYTHONPATH"] = develop_tvm_path + ":" + os.environ.get("PYTHONPATH", "") sys.path.insert(0, develop_tvm_path) diff --git a/python/bitblas/base/__init__.py b/bitblas/base/__init__.py similarity index 100% rename from python/bitblas/base/__init__.py rename to bitblas/base/__init__.py diff --git a/python/bitblas/base/analysis.py b/bitblas/base/analysis.py similarity index 100% rename from python/bitblas/base/analysis.py rename to bitblas/base/analysis.py diff --git a/python/bitblas/base/common_schedules.py b/bitblas/base/common_schedules.py similarity index 100% rename from python/bitblas/base/common_schedules.py rename to bitblas/base/common_schedules.py diff --git a/python/bitblas/base/roller/__init__.py b/bitblas/base/roller/__init__.py similarity index 100% rename from python/bitblas/base/roller/__init__.py rename to bitblas/base/roller/__init__.py diff --git a/python/bitblas/base/roller/arch/__init__.py b/bitblas/base/roller/arch/__init__.py similarity index 100% rename from python/bitblas/base/roller/arch/__init__.py rename to bitblas/base/roller/arch/__init__.py diff --git a/python/bitblas/base/roller/arch/arch_base.py b/bitblas/base/roller/arch/arch_base.py similarity index 100% rename from python/bitblas/base/roller/arch/arch_base.py rename to bitblas/base/roller/arch/arch_base.py diff --git a/python/bitblas/base/roller/arch/cpu.py b/bitblas/base/roller/arch/cpu.py similarity index 100% rename from python/bitblas/base/roller/arch/cpu.py rename to bitblas/base/roller/arch/cpu.py diff --git a/python/bitblas/base/roller/arch/cuda.py b/bitblas/base/roller/arch/cuda.py similarity index 100% rename from python/bitblas/base/roller/arch/cuda.py rename to bitblas/base/roller/arch/cuda.py diff --git a/python/bitblas/base/roller/bestfit.py b/bitblas/base/roller/bestfit.py similarity index 100% rename from python/bitblas/base/roller/bestfit.py rename to bitblas/base/roller/bestfit.py diff --git a/python/bitblas/base/roller/hint.py b/bitblas/base/roller/hint.py similarity index 100% rename from python/bitblas/base/roller/hint.py rename to bitblas/base/roller/hint.py diff --git a/python/bitblas/base/roller/node.py b/bitblas/base/roller/node.py similarity index 100% rename from python/bitblas/base/roller/node.py rename to bitblas/base/roller/node.py diff --git a/python/bitblas/base/roller/policy/__init__.py b/bitblas/base/roller/policy/__init__.py similarity index 100% rename from python/bitblas/base/roller/policy/__init__.py rename to bitblas/base/roller/policy/__init__.py diff --git a/python/bitblas/base/roller/policy/common.py b/bitblas/base/roller/policy/common.py similarity index 100% rename from python/bitblas/base/roller/policy/common.py rename to bitblas/base/roller/policy/common.py diff --git a/python/bitblas/base/roller/policy/default.py b/bitblas/base/roller/policy/default.py similarity index 100% rename from python/bitblas/base/roller/policy/default.py rename to bitblas/base/roller/policy/default.py diff --git a/python/bitblas/base/roller/policy/tensorcore.py b/bitblas/base/roller/policy/tensorcore.py similarity index 100% rename from python/bitblas/base/roller/policy/tensorcore.py rename to bitblas/base/roller/policy/tensorcore.py diff --git a/python/bitblas/base/roller/rasterization.py b/bitblas/base/roller/rasterization.py similarity index 100% rename from python/bitblas/base/roller/rasterization.py rename to bitblas/base/roller/rasterization.py diff --git a/python/bitblas/base/roller/shape_inference/__init__.py b/bitblas/base/roller/shape_inference/__init__.py similarity index 100% rename from python/bitblas/base/roller/shape_inference/__init__.py rename to bitblas/base/roller/shape_inference/__init__.py diff --git a/python/bitblas/base/roller/shape_inference/common.py b/bitblas/base/roller/shape_inference/common.py similarity index 100% rename from python/bitblas/base/roller/shape_inference/common.py rename to bitblas/base/roller/shape_inference/common.py diff --git a/python/bitblas/base/roller/shape_inference/tir.py b/bitblas/base/roller/shape_inference/tir.py similarity index 100% rename from python/bitblas/base/roller/shape_inference/tir.py rename to bitblas/base/roller/shape_inference/tir.py diff --git a/python/bitblas/base/schedule_rule.py b/bitblas/base/schedule_rule.py similarity index 100% rename from python/bitblas/base/schedule_rule.py rename to bitblas/base/schedule_rule.py diff --git a/python/bitblas/base/transform.py b/bitblas/base/transform.py similarity index 100% rename from python/bitblas/base/transform.py rename to bitblas/base/transform.py diff --git a/python/bitblas/base/utils.py b/bitblas/base/utils.py similarity index 100% rename from python/bitblas/base/utils.py rename to bitblas/base/utils.py diff --git a/python/bitblas/cache/__init__.py b/bitblas/cache/__init__.py similarity index 100% rename from python/bitblas/cache/__init__.py rename to bitblas/cache/__init__.py diff --git a/python/bitblas/cache/operator.py b/bitblas/cache/operator.py similarity index 100% rename from python/bitblas/cache/operator.py rename to bitblas/cache/operator.py diff --git a/python/bitblas/generator.py b/bitblas/generator.py similarity index 100% rename from python/bitblas/generator.py rename to bitblas/generator.py diff --git a/python/bitblas/gpu/__init__.py b/bitblas/gpu/__init__.py similarity index 100% rename from python/bitblas/gpu/__init__.py rename to bitblas/gpu/__init__.py diff --git a/python/bitblas/gpu/base.py b/bitblas/gpu/base.py similarity index 100% rename from python/bitblas/gpu/base.py rename to bitblas/gpu/base.py diff --git a/python/bitblas/gpu/element_wise.py b/bitblas/gpu/element_wise.py similarity index 100% rename from python/bitblas/gpu/element_wise.py rename to bitblas/gpu/element_wise.py diff --git a/python/bitblas/gpu/fallback.py b/bitblas/gpu/fallback.py similarity index 100% rename from python/bitblas/gpu/fallback.py rename to bitblas/gpu/fallback.py diff --git a/python/bitblas/gpu/gemv.py b/bitblas/gpu/gemv.py similarity index 100% rename from python/bitblas/gpu/gemv.py rename to bitblas/gpu/gemv.py diff --git a/python/bitblas/gpu/gemv_dequantize.py b/bitblas/gpu/gemv_dequantize.py similarity index 100% rename from python/bitblas/gpu/gemv_dequantize.py rename to bitblas/gpu/gemv_dequantize.py diff --git a/python/bitblas/gpu/general_reduction.py b/bitblas/gpu/general_reduction.py similarity index 100% rename from python/bitblas/gpu/general_reduction.py rename to bitblas/gpu/general_reduction.py diff --git a/python/bitblas/gpu/intrin/__init__.py b/bitblas/gpu/intrin/__init__.py similarity index 100% rename from python/bitblas/gpu/intrin/__init__.py rename to bitblas/gpu/intrin/__init__.py diff --git a/python/bitblas/gpu/intrin/lop3.py b/bitblas/gpu/intrin/lop3.py similarity index 100% rename from python/bitblas/gpu/intrin/lop3.py rename to bitblas/gpu/intrin/lop3.py diff --git a/python/bitblas/gpu/matmul.py b/bitblas/gpu/matmul.py similarity index 100% rename from python/bitblas/gpu/matmul.py rename to bitblas/gpu/matmul.py diff --git a/python/bitblas/gpu/matmul_analysis.py b/bitblas/gpu/matmul_analysis.py similarity index 100% rename from python/bitblas/gpu/matmul_analysis.py rename to bitblas/gpu/matmul_analysis.py diff --git a/python/bitblas/gpu/matmul_mma.py b/bitblas/gpu/matmul_mma.py similarity index 100% rename from python/bitblas/gpu/matmul_mma.py rename to bitblas/gpu/matmul_mma.py diff --git a/python/bitblas/gpu/matmul_mma_dequantize.py b/bitblas/gpu/matmul_mma_dequantize.py similarity index 100% rename from python/bitblas/gpu/matmul_mma_dequantize.py rename to bitblas/gpu/matmul_mma_dequantize.py diff --git a/python/bitblas/gpu/matmul_wmma.py b/bitblas/gpu/matmul_wmma.py similarity index 100% rename from python/bitblas/gpu/matmul_wmma.py rename to bitblas/gpu/matmul_wmma.py diff --git a/python/bitblas/gpu/reduction.py b/bitblas/gpu/reduction.py similarity index 100% rename from python/bitblas/gpu/reduction.py rename to bitblas/gpu/reduction.py diff --git a/python/bitblas/gpu/rmsnorm.py b/bitblas/gpu/rmsnorm.py similarity index 100% rename from python/bitblas/gpu/rmsnorm.py rename to bitblas/gpu/rmsnorm.py diff --git a/python/bitblas/gpu/transpose.py b/bitblas/gpu/transpose.py similarity index 100% rename from python/bitblas/gpu/transpose.py rename to bitblas/gpu/transpose.py diff --git a/python/bitblas/gpu/utils.py b/bitblas/gpu/utils.py similarity index 100% rename from python/bitblas/gpu/utils.py rename to bitblas/gpu/utils.py diff --git a/python/bitblas/module/__init__.py b/bitblas/module/__init__.py similarity index 100% rename from python/bitblas/module/__init__.py rename to bitblas/module/__init__.py diff --git a/python/bitblas/ops/__init__.py b/bitblas/ops/__init__.py similarity index 100% rename from python/bitblas/ops/__init__.py rename to bitblas/ops/__init__.py diff --git a/python/bitblas/ops/general_matmul.py b/bitblas/ops/general_matmul.py similarity index 100% rename from python/bitblas/ops/general_matmul.py rename to bitblas/ops/general_matmul.py diff --git a/python/bitblas/ops/general_matmul_splitk.py b/bitblas/ops/general_matmul_splitk.py similarity index 100% rename from python/bitblas/ops/general_matmul_splitk.py rename to bitblas/ops/general_matmul_splitk.py diff --git a/python/bitblas/ops/impl/__init__.py b/bitblas/ops/impl/__init__.py similarity index 100% rename from python/bitblas/ops/impl/__init__.py rename to bitblas/ops/impl/__init__.py diff --git a/python/bitblas/ops/impl/batch_matmul_dequantize_impl.py b/bitblas/ops/impl/batch_matmul_dequantize_impl.py similarity index 100% rename from python/bitblas/ops/impl/batch_matmul_dequantize_impl.py rename to bitblas/ops/impl/batch_matmul_dequantize_impl.py diff --git a/python/bitblas/ops/impl/batch_matmul_impl.py b/bitblas/ops/impl/batch_matmul_impl.py similarity index 100% rename from python/bitblas/ops/impl/batch_matmul_impl.py rename to bitblas/ops/impl/batch_matmul_impl.py diff --git a/python/bitblas/ops/impl/convolution2d_impl.py b/bitblas/ops/impl/convolution2d_impl.py similarity index 100% rename from python/bitblas/ops/impl/convolution2d_impl.py rename to bitblas/ops/impl/convolution2d_impl.py diff --git a/python/bitblas/ops/impl/ladder_permutate_impl.py b/bitblas/ops/impl/ladder_permutate_impl.py similarity index 100% rename from python/bitblas/ops/impl/ladder_permutate_impl.py rename to bitblas/ops/impl/ladder_permutate_impl.py diff --git a/python/bitblas/ops/impl/lop3_permutate_impl.py b/bitblas/ops/impl/lop3_permutate_impl.py similarity index 100% rename from python/bitblas/ops/impl/lop3_permutate_impl.py rename to bitblas/ops/impl/lop3_permutate_impl.py diff --git a/python/bitblas/ops/impl/matmul_dequantize_impl.py b/bitblas/ops/impl/matmul_dequantize_impl.py similarity index 100% rename from python/bitblas/ops/impl/matmul_dequantize_impl.py rename to bitblas/ops/impl/matmul_dequantize_impl.py diff --git a/python/bitblas/ops/impl/matmul_dequantize_splitk_impl.py b/bitblas/ops/impl/matmul_dequantize_splitk_impl.py similarity index 100% rename from python/bitblas/ops/impl/matmul_dequantize_splitk_impl.py rename to bitblas/ops/impl/matmul_dequantize_splitk_impl.py diff --git a/python/bitblas/ops/impl/matmul_impl.py b/bitblas/ops/impl/matmul_impl.py similarity index 100% rename from python/bitblas/ops/impl/matmul_impl.py rename to bitblas/ops/impl/matmul_impl.py diff --git a/python/bitblas/ops/impl/matmul_splitk_impl.py b/bitblas/ops/impl/matmul_splitk_impl.py similarity index 100% rename from python/bitblas/ops/impl/matmul_splitk_impl.py rename to bitblas/ops/impl/matmul_splitk_impl.py diff --git a/python/bitblas/ops/impl/param_permutate_impl.py b/bitblas/ops/impl/param_permutate_impl.py similarity index 100% rename from python/bitblas/ops/impl/param_permutate_impl.py rename to bitblas/ops/impl/param_permutate_impl.py diff --git a/python/bitblas/ops/ladder_permutate.py b/bitblas/ops/ladder_permutate.py similarity index 100% rename from python/bitblas/ops/ladder_permutate.py rename to bitblas/ops/ladder_permutate.py diff --git a/python/bitblas/ops/lop3_permutate.py b/bitblas/ops/lop3_permutate.py similarity index 100% rename from python/bitblas/ops/lop3_permutate.py rename to bitblas/ops/lop3_permutate.py diff --git a/python/bitblas/ops/matmul.py b/bitblas/ops/matmul.py similarity index 100% rename from python/bitblas/ops/matmul.py rename to bitblas/ops/matmul.py diff --git a/python/bitblas/ops/matmul_dequantize.py b/bitblas/ops/matmul_dequantize.py similarity index 100% rename from python/bitblas/ops/matmul_dequantize.py rename to bitblas/ops/matmul_dequantize.py diff --git a/python/bitblas/ops/operator.py b/bitblas/ops/operator.py similarity index 100% rename from python/bitblas/ops/operator.py rename to bitblas/ops/operator.py diff --git a/python/bitblas/ops/param_permutate.py b/bitblas/ops/param_permutate.py similarity index 100% rename from python/bitblas/ops/param_permutate.py rename to bitblas/ops/param_permutate.py diff --git a/python/bitblas/quantization/__init__.py b/bitblas/quantization/__init__.py similarity index 100% rename from python/bitblas/quantization/__init__.py rename to bitblas/quantization/__init__.py diff --git a/python/bitblas/quantization/quantization.py b/bitblas/quantization/quantization.py similarity index 100% rename from python/bitblas/quantization/quantization.py rename to bitblas/quantization/quantization.py diff --git a/python/bitblas/quantization/utils.py b/bitblas/quantization/utils.py similarity index 100% rename from python/bitblas/quantization/utils.py rename to bitblas/quantization/utils.py diff --git a/python/bitblas/relax/op/interleave_weight.py b/bitblas/relax/op/interleave_weight.py similarity index 100% rename from python/bitblas/relax/op/interleave_weight.py rename to bitblas/relax/op/interleave_weight.py diff --git a/python/bitblas/relax/transform/__init__.py b/bitblas/relax/transform/__init__.py similarity index 100% rename from python/bitblas/relax/transform/__init__.py rename to bitblas/relax/transform/__init__.py diff --git a/python/bitblas/relax/transform/annotate_decode_block.py b/bitblas/relax/transform/annotate_decode_block.py similarity index 100% rename from python/bitblas/relax/transform/annotate_decode_block.py rename to bitblas/relax/transform/annotate_decode_block.py diff --git a/python/bitblas/relax/transform/weight_only_propagate.py b/bitblas/relax/transform/weight_only_propagate.py similarity index 100% rename from python/bitblas/relax/transform/weight_only_propagate.py rename to bitblas/relax/transform/weight_only_propagate.py diff --git a/python/bitblas/testing/__init__.py b/bitblas/testing/__init__.py similarity index 100% rename from python/bitblas/testing/__init__.py rename to bitblas/testing/__init__.py diff --git a/python/bitblas/utils/__init__.py b/bitblas/utils/__init__.py similarity index 100% rename from python/bitblas/utils/__init__.py rename to bitblas/utils/__init__.py diff --git a/python/bitblas/utils/post_process.py b/bitblas/utils/post_process.py similarity index 100% rename from python/bitblas/utils/post_process.py rename to bitblas/utils/post_process.py diff --git a/python/bitblas/utils/target_detector.py b/bitblas/utils/target_detector.py similarity index 100% rename from python/bitblas/utils/target_detector.py rename to bitblas/utils/target_detector.py diff --git a/python/bitblas/utils/tensor_adapter.py b/bitblas/utils/tensor_adapter.py similarity index 100% rename from python/bitblas/utils/tensor_adapter.py rename to bitblas/utils/tensor_adapter.py diff --git a/python/bitblas/wrapper/__init__.py b/bitblas/wrapper/__init__.py similarity index 100% rename from python/bitblas/wrapper/__init__.py rename to bitblas/wrapper/__init__.py diff --git a/python/bitblas/wrapper/general.py b/bitblas/wrapper/general.py similarity index 100% rename from python/bitblas/wrapper/general.py rename to bitblas/wrapper/general.py diff --git a/install.sh b/install.sh new file mode 100755 index 000000000..584392820 --- /dev/null +++ b/install.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# install requirements +pip install -r requirements.txt + +# install llvm +apt-get install llvm-10 + +# clone and build tvm +git submodule update --init --recursive + +cd 3rdparty/tvm +mkdir build +cp cmake/config.cmake build +cd build +echo "set(USE_LLVM llvm-config-10)" >> config.cmake && echo "set(USE_CUDA ON)" >> config.cmake + +cmake .. && make -j && cd ../../.. + +echo "export TVM_HOME=$(pwd)/3rdparty/tvm" >> ~/.bashrc +echo "export PYTHONPATH=\$TVM_HOME/python:$(pwd)/python:\$PYTHONPATH" >> ~/.bashrc + +source ~/.bashrc diff --git a/python/bitblas_cli.py b/python/bitblas_cli.py deleted file mode 100644 index 59e481eb9..000000000 --- a/python/bitblas_cli.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. diff --git a/testing/python/dsl/test_auto_normalized_tensorcore.py b/testing/python/dsl/test_auto_normalized_tensorcore.py deleted file mode 100644 index eb6e0baef..000000000 --- a/testing/python/dsl/test_auto_normalized_tensorcore.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -import numpy as np -import tvm -from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy -from bitblas.base.roller.arch import CUDA -from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags -from bitblas.gpu import Matmul -from bitblas.ops.impl.convolution2d_impl import conv2d_nhwc_hwio, conv2d_nhwc_ohwi -from bitblas.base.utils import apply_and_build -import time - -benchmark_sets = [ - # (prim_func, input_args, default_bitblas_schedule), - (conv2d_nhwc_hwio, (128, 64, 224, 224, 3, 7, 7, 2, 1, 3, "float16", "float16"), Matmul), - (conv2d_nhwc_ohwi, (128, 64, 56, 56, 64, 3, 3, 1, 1, 1, "float16", "float16"), Matmul), - (conv2d_nhwc_hwio, (128, 64, 56, 56, 64, 1, 1, 1, 1, 1, "float16", "float16"), Matmul), - (conv2d_nhwc_ohwi, (128, 64, 56, 56, 64, 1, 1, 1, 1, 1, "float16", "float16"), Matmul), - (conv2d_nhwc_ohwi, (128, 128, 28, 28, 128, 3, 3, 1, 1, 1, "float16", "float16"), Matmul), - (conv2d_nhwc_hwio, (128, 256, 14, 14, 128, 3, 3, 2, 1, 1, "float16", "float16"), Matmul), - (conv2d_nhwc_ohwi, (128, 256, 14, 14, 128, 1, 1, 2, 1, 1, "float16", "float16"), Matmul), -] -benchmark_results = {} -for get_prim_func, input_args, d_schedule in benchmark_sets: - ir_module = get_prim_func(*input_args) - func = ir_module["main"] - target = tvm.target.Target("nvidia/nvidia-a100") - arch = CUDA(target) - policy = DefaultPolicy(func=func, arch=arch) - tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) - try: - tensorized_func, tags = get_tensorized_func_and_tags(func, arch.target) - except Exception as e: - print(f"Failed to get tensorized function and tags: {e}") - tags = None - if tags: - policy = TensorCorePolicy(func=tensorized_func, arch=arch, tags=tags) - - configs = policy.emit_config(20) - - tune_start = time.time() - cpresults, best = apply_and_build(func, configs, arch, parallel_build=True) - fast_tune_time = time.time() - tune_start - print("[BitBLAS] The best latency of top 1 is {:.3f} ms".format(cpresults[0].latency * 1e3)) - print("[BitBLAS] The best latency of top 20 is {:.3f} ms".format(best.latency * 1e3)) - - # evaluate the performance of the default schedule - - rule = d_schedule() - default_tune_start = time.time() - sch_default = rule.apply(func, target, False) - with tvm.transform.PassContext(config={"tir.use_async_copy": True}): - mod_default = tvm.build(sch_default.mod["main"], target="cuda") - default_tune_time = time.time() - default_tune_start - - args = func.buffer_map.values() - - profile_tensors = [] - for arg in args: - profile_tensors.append( - tvm.nd.array( - np.random.uniform(0, 1, [int(i) for i in arg.shape]).astype(arg.dtype), - device=arch.device, - )) - - timer_cuda_mod = mod_default.time_evaluator(mod_default.entry_name, arch.device, number=5) - t = timer_cuda_mod(*profile_tensors).mean - - print("Time cost of BitBLAS default schedule: {:.3f} ms".format(t * 1e3)) - - profile_config = { - f"{get_prim_func.__name__}-{'-'.join([str(i) for i in input_args])}": { - "fast_bitblas_top20_tune_time": fast_tune_time, - "fast_bitblas_top1_latency": cpresults[0].latency * 1e3, - "fast_bitblas_top20_latency": best.latency * 1e3, - "default_bitblas_tune_time": default_tune_time, - "default_bitblas_latency": t * 1e3, - } - } - benchmark_results.update(profile_config) - -headers = [ - "PrimFunc", - "Input Arguments", - "FastDLight Top20 Tune Time", - "FastDLight Top1 Latency", - "FastDLight Top20 Latency", - "DefaultDLight Tune Time", - "DefaultDLight Latency", -] - -col_width = (max(len(word) for row in [headers] + list(profile_config.values()) for word in row) + 2 - ) # padding - -print("".join(word.ljust(col_width) for word in headers)) - -print("-" * col_width * len(headers)) - -for config, values in benchmark_results.items(): - args = config.split("-") - func_name = args[0] - input_args = "-".join(args[1:]) - row = [ - func_name, - input_args, - f" {str(values['fast_bitblas_top20_tune_time'])} s", - f"{values['fast_bitblas_top1_latency']:.3f} ms", - f"{values['fast_bitblas_top20_latency']:.3f} ms", - str(values["default_bitblas_tune_time"]), - f"{values['default_bitblas_latency']:.3f} ms", - ] - print("".join(word.ljust(col_width) for word in row))