Skip to content

Commit

Permalink
tqdm
Browse files Browse the repository at this point in the history
  • Loading branch information
LeiWang1999 committed Aug 9, 2024
1 parent 4289d7b commit 2327e14
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 199 deletions.
225 changes: 111 additions & 114 deletions benchmark/operators/benchmark_matmul_scope_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,123 +9,118 @@
from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags
from bitblas.base.utils import apply_and_build


# Initialize the parser
parser = argparse.ArgumentParser(
description="Benchmark BitBLAS int4 on a specific target."
)

# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking."
)
parser.add_argument(
"--group_size",
type=int,
default=None,
help="Group size for grouped quantization."
)
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32", "int8"], # Assuming these are the valid choices
help="Data type of activation A."
)
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=["float16", "float32", "float64", "int32", "int8", "int4", "int2", "int1", "nf4", "fp4_e2m1"], # Assuming these are the valid choices
help="Data type of weight W."
)
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
# Initialize the parser
parser = argparse.ArgumentParser(description="Benchmark BitBLAS int4 on a specific target.")

# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking.")
parser.add_argument(
"--group_size", type=int, default=None, help="Group size for grouped quantization.")
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32",
"int8"], # Assuming these are the valid choices
help="Data type of activation A.")
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=[
"float16", "float32", "float64", "int32", "int8", "int4", "int2", "int1", "nf4", "fp4_e2m1"
], # Assuming these are the valid choices
help="Data type of weight W.")
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
choices=["float16", "int32"], # Assuming these are the valid choices
help="Data type for accumulation."
)
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
help="Data type for accumulation.")
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
choices=["float16", "float32", "int32", "int8"], # Assuming these are the valid choices
help="Data type for output."
)
parser.add_argument(
"--layout",
type=str,
default="nt",
help="Data type for output.")
parser.add_argument(
"--layout",
type=str,
default="nt",
choices=["nt", "nn"], # Assuming these are the valid choices
help="Matrix layout, 'nt' for non-transpose A and transpose W."
)
parser.add_argument(
"--with_bias",
action="store_true",
help="Include bias in the benchmark."
)
parser.add_argument(
"--with_scaling",
action="store_true",
help="Include scaling factor in the quantization."
)
parser.add_argument(
"--with_zeros",
action="store_true",
help="Include zeros in the quantization."
)
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
help="Matrix layout, 'nt' for non-transpose A and transpose W.")
parser.add_argument("--with_bias", action="store_true", help="Include bias in the benchmark.")
parser.add_argument(
"--with_scaling", action="store_true", help="Include scaling factor in the quantization.")
parser.add_argument("--with_zeros", action="store_true", help="Include zeros in the quantization.")
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
choices=["original", "rescale", "quantized"], # Replace with actual modes if applicable
help="Specify the mode for calculating zeros."
)

# Parse the arguments
args = parser.parse_args()

# Assign arguments to variables
target = args.target
group_size = args.group_size
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
help="Specify the mode for calculating zeros.")

test_shapes = [
# square test
(MatmulConfig, Matmul, (1, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (16, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (32, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (64, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (128, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (256, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (1024, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (16, 43008, 14336, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (32, 14336, 14336, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (64, 57344, 14336, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (128, 14336, 57344, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
# Parse the arguments
args = parser.parse_args()

(MatmulConfig, Matmul, (256, 9216, 9216, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (128, 36864, 9216, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (64, 9216, 36864, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (32, 22016, 8192, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
# Assign arguments to variables
target = args.target
group_size = args.group_size
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode

(MatmulConfig, Matmul, (16, 8192, 22016, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (32, 8192, 8192, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (64, 28672, 8192, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
(MatmulConfig, Matmul, (128, 8192, 28672, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
# (M, N, K) problem sizes to benchmark: a square-size sweep followed by
# LLM-style GEMM shapes (Llama/Qwen-like projection dimensions).
_matmul_sizes = [
    # square test
    (1, 16384, 16384),
    (16, 16384, 16384),
    (32, 16384, 16384),
    (64, 16384, 16384),
    (128, 16384, 16384),
    (256, 16384, 16384),
    (1024, 16384, 16384),
    (16, 43008, 14336),
    (32, 14336, 14336),
    (64, 57344, 14336),
    (128, 14336, 57344),
    (256, 9216, 9216),
    (128, 36864, 9216),
    (64, 9216, 36864),
    (32, 22016, 8192),
    (16, 8192, 22016),
    (32, 8192, 8192),
    (64, 28672, 8192),
    (128, 8192, 28672),
]

# Each entry pairs the config/operator classes with the full argument tuple;
# all quantization/layout settings come from the parsed CLI arguments above.
test_shapes = [
    (MatmulConfig, Matmul, (M, N, K, A_dtype, W_dtype, out_dtype, accum_dtype, layout,
                            with_bias, group_size, with_scaling, with_zeros, zeros_mode))
    for (M, N, K) in _matmul_sizes
]

benchmark_sets = []
Expand All @@ -152,7 +147,7 @@
for config in configs:
static_config = config
static_config.shared_scope = "shared"
static_configs.append(static_config)
static_configs.append(static_config)
dynamic_configs = []
for config in configs:
dynamic_config = config
Expand All @@ -162,8 +157,10 @@
_, best_static = apply_and_build(func, static_configs, arch, parallel_build=True)

_, best_dynamic = apply_and_build(func, dynamic_configs, arch, parallel_build=True)
benchmark_results[input_args] = (best_static.latency, best_dynamic.latency, best_static.latency - best_dynamic.latency)
benchmark_results[input_args] = (best_static.latency, best_dynamic.latency,
best_static.latency - best_dynamic.latency)

for key, value in benchmark_results.items():
print(f"Input arguments: {key}, Static latency: {value[0]}, Dynamic latency: {value[1]}, Difference: {value[2]}")

print(
f"Input arguments: {key}, Static latency: {value[0]}, Dynamic latency: {value[1]}, Difference: {value[2]}"
)
102 changes: 17 additions & 85 deletions benchmark/operators/benchmark_matmul_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from bitblas.utils import get_commit_id
from bitblas import set_log_level
from tabulate import tabulate
import json
from os import path, makedirs
from typing import Dict, List, Union
from typing import List
import argparse
from tqdm import tqdm

set_log_level("DEBUG")

Expand Down Expand Up @@ -105,82 +105,6 @@ def generate_operator_config(self, name: str, M, N, K) -> MatmulConfig:
**self.config_map[name],
)

def serialize_results(self) -> None:
    """Serialize benchmark results, shapes, and device info into JSON files.

    Writes three JSON files under ``<log_path>/CommitID_<current_commit>/``:
    the raw benchmark results, the benchmarked shapes, and the target device.
    """
    commit_id_path = f"CommitID_{self.CURRENT_COMMIT_ID}"
    log_commit_path = path.join(self.log_path, commit_id_path)

    # Create the per-commit log directory on first use.
    if not path.exists(log_commit_path):
        makedirs(log_commit_path)

    # Save benchmark results into JSON
    self._save_json(
        self.benchmark_results,
        path.join(log_commit_path, self.BENCHMARK_RESULTS_FILE),
    )

    # Save benchmark shapes into JSON.
    # Maps benchmark-set name -> list of [M, N, K, dynamic_profiling_shape].
    shapes: Dict[str, List[List]] = {}

    # Iterate through the benchmark results to extract the shapes;
    # index i pairs each result with its originating (op, config, dyn_shape)
    # entry in self.benchmark_sets[name].
    for name, results in self.benchmark_results.items():
        shapes[name] = []
        for i, _ in enumerate(results):
            config = self.benchmark_sets[name][i][1]
            dyn_prof_shape = self.benchmark_sets[name][i][2]
            shapes[name].append([config.M, config.N, config.K, dyn_prof_shape])

    self._save_json(shapes, path.join(log_commit_path, self.BENCHMARK_SHAPES_FILE))

    # Save device info into JSON
    self._save_json(
        {"device": self.benchmark_target},
        path.join(log_commit_path, self.BENCHMARK_DEVICE_FILE),
    )

def _save_json(self, data, file_path):
    """Serialize *data* as JSON and write it to *file_path*."""
    with open(file_path, "w") as out_file:
        json.dump(data, out_file)

@classmethod
def deserialize_from_logs(cls, commit_id: str):
    """Reconstruct a benchmark instance from previously serialized JSON logs.

    Loads the results, shapes, and device files written by
    ``serialize_results`` for the given *commit_id* and returns the rebuilt
    benchmark object. (The original ``-> None`` annotation was incorrect:
    the method returns the new instance.)
    """
    benchmark = cls()
    commit_id_path = f"CommitID_{commit_id}"
    log_commit_path = path.join(benchmark.log_path, commit_id_path)

    benchmark.benchmark_results = cls._load_json(
        path.join(log_commit_path, cls.BENCHMARK_RESULTS_FILE))

    shapes_file = path.join(log_commit_path, cls.BENCHMARK_SHAPES_FILE)

    # Rebuild one benchmark set per name, one op unit per serialized
    # [M, N, K, dyn_prof_shape] entry.
    with open(shapes_file, "r") as f:
        shapes = json.load(f)
    for name, shape_list in shapes.items():
        for shape in shape_list:
            M, N, K, dyn_prof_shape = shape
            benchmark.add_benchmark_set(
                name,
                [
                    benchmark.generate_op_unit(
                        benchmark.generate_operator_config(name, M, N, K),
                        dynamic_profiling_shape=dyn_prof_shape,
                    )
                ],
            )

    benchmark.benchmark_target = cls._load_json(
        path.join(log_commit_path, cls.BENCHMARK_DEVICE_FILE))["device"]

    return benchmark

@staticmethod
def _load_json(file_path):
    """Read *file_path* and return its parsed JSON content."""
    with open(file_path, "r") as src:
        return json.load(src)

def report(self):
"""Generate and print a report of the benchmark results."""
results4compare = {}
Expand Down Expand Up @@ -271,13 +195,21 @@ def make_operator(self, operator: Matmul, config: MatmulConfig) -> Matmul:

def benchmark(self):
    """Run benchmarks on all benchmark sets with a tqdm progress bar.

    For every (op, config, _) entry in each benchmark set, runs one
    benchmark per shape in ``self.OPT_SHAPES`` and accumulates the
    results in ``self.benchmark_results`` keyed by set name.
    """
    # Calculate the total number of benchmark runs for the progress bar
    total_runs = sum(
        len(benchmark_set) * len(self.OPT_SHAPES)
        for benchmark_set in self.benchmark_sets.values())

    with tqdm(total=total_runs, desc="Total Progress", unit="benchmark") as pbar:
        for name, benchmark_set in self.benchmark_sets.items():
            self.benchmark_results[name] = []
            for op, config, _ in benchmark_set:
                for opt in self.OPT_SHAPES:
                    print(f"Running benchmark for {name} with shape {opt}")
                    # Each run produces a single result entry, so append it
                    # directly instead of extend([...]) with a one-item list.
                    self.benchmark_results[name].append(
                        self.run_benchmark(op, config, {"m": opt}))
                    # Update the progress bar after each run
                    pbar.update(1)

def run_compare_strategy(self, report=True, serialize=True, enable_tuning: bool = False):
"""Run the benchmark process."""
Expand Down

0 comments on commit 2327e14

Please sign in to comment.