
Commit 6b76adb

Micro-benchmark inference (#1759)
1 parent c17536e commit 6b76adb

13 files changed, +1085 -0 lines changed

benchmarks/__init__.py

Whitespace-only changes.

benchmarks/microbenchmarks/README.md

+84
@@ -0,0 +1,84 @@
# Microbenchmarks

This directory contains microbenchmarking tools for measuring inference performance across different quantization methods and model architectures.

## Overview

The microbenchmarking system works as follows:

![Microbenchmarks Process Flow](../../docs/static/microbenchmarking_process_diagram.png)

## Components

![Microbenchmarks Flow](../../docs/static/microbenchmarks_code_flow_diagram.png)

- **benchmark_runner.py**: Main entry point that orchestrates the benchmarking process
- **benchmark_inference.py**: Handles model creation and inference benchmarking
- **utils.py**: Contains utility functions and configuration classes
- **test/**: Test files and sample configurations

## Usage

1. Create a configuration YAML file (see the example below)
2. Run the benchmark using:

```bash
python -m benchmarks.microbenchmarks.benchmark_runner --config path/to/config.yml
```

### Example Configuration

```yaml
# Sample configuration for inference benchmarks
quantization_config_recipe_names:
  - "baseline"
  - "int8wo"
  - "int4wo-128"
  - "int4wo-128-hqq"

output_dir: "benchmarks/microbenchmarks/results"

model_params:
  - matrix_shapes:
      - name: "custom"
        shapes: [
          [1024, 1024, 1024], # [m, k, n]
          [2048, 4096, 1024],
          [4096, 4096, 1024]
        ]
    high_precision_dtype: "torch.bfloat16"
    use_torch_compile: true # Set to false to skip torch.compile
    torch_compile_mode: "max-autotune" # Options: "default", "max-autotune"
    device: "cuda" # Options: "cuda", "mps", "xpu", "cpu"
    model_type: "linear" # Options: "linear", "ln_linear_sigmoid"
```

## Configuration Options

### Quantization Methods
Currently, the quantization string uses the same format as the one passed to llama/generate.py (see the sketch after this list for how such strings typically map to quantization configs).
- `baseline`: No quantization
- `int8wo`: 8-bit weight-only quantization
- `int4wo-{group_size}`: 4-bit weight-only quantization with the specified group size
- `int4wo-{group_size}-hqq`: 4-bit weight-only quantization with HQQ
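
The snippet below is a minimal sketch of how such recipe strings are commonly translated into torchao quantization configs. The actual parsing in this benchmark lives in `utils.string_to_config` (not shown in this README), so the helper name `recipe_to_config` and the exact argument handling here are illustrative assumptions.

```python
# Illustrative sketch only: the real mapping is implemented in
# benchmarks/microbenchmarks/utils.py (string_to_config).
from torchao.quantization import int4_weight_only, int8_weight_only


def recipe_to_config(recipe: str):
    """Hypothetical helper mapping a recipe string to a quantize_()-compatible config."""
    if recipe == "baseline":
        return None  # no quantization
    if recipe == "int8wo":
        return int8_weight_only()
    if recipe.startswith("int4wo-"):
        parts = recipe.split("-")  # e.g. ["int4wo", "128"] or ["int4wo", "128", "hqq"]
        group_size = int(parts[1])
        use_hqq = len(parts) > 2 and parts[2] == "hqq"
        return int4_weight_only(group_size=group_size, use_hqq=use_hqq)
    raise ValueError(f"Unknown quantization recipe: {recipe}")
```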

### Model Types
- `linear`: Simple linear layer
- `ln_linear_sigmoid`: LayerNorm + Linear + Sigmoid (see the sketch below)
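
As a point of reference, the `ln_linear_sigmoid` model corresponds to a module roughly like the following. This is a sketch under the assumption that a shape `[m, k, n]` means an input of size `(m, k)` feeding a `k -> n` linear layer; the real model construction lives in `utils.create_model_and_input`, which is not part of this README.

```python
import torch
import torch.nn as nn


class LNLinearSigmoid(nn.Module):
    """Sketch of the ln_linear_sigmoid model type: LayerNorm -> Linear -> Sigmoid."""

    def __init__(self, k: int, n: int, dtype: torch.dtype = torch.bfloat16):
        super().__init__()
        self.ln = nn.LayerNorm(k, dtype=dtype)
        self.linear = nn.Linear(k, n, dtype=dtype)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.sigmoid(self.linear(self.ln(x)))


# For shape [m, k, n] = [1024, 1024, 1024]: a (1024, 1024) bf16 input through a 1024 -> 1024 layer.
x = torch.randn(1024, 1024, dtype=torch.bfloat16)
y = LNLinearSigmoid(k=1024, n=1024)(x)  # y.shape == (1024, 1024)
```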

### Device Options
- `cuda`: NVIDIA GPU
- `xpu`: Intel GPU
- `mps`: Apple Silicon GPU
- `cpu`: CPU fallback
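
If you want to fill in the `device` field based on whatever hardware is present, a small helper along these lines works. This is a hypothetical convenience snippet, not something provided by this commit.

```python
import torch


def pick_device() -> str:
    """Pick the first available device, in the same priority order as the list above."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"
```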

## Output

Results are saved to a CSV file in the specified output directory.

## Running Tests

To run the test suite:

```bash
python -m unittest discover benchmarks/microbenchmarks/test
```

benchmarks/microbenchmarks/__init__.py

Whitespace-only changes.

benchmarks/microbenchmarks/benchmark_inference.py

@@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Inference benchmark runner

This script runs inference benchmarks and generates a micro-benchmarking report.
- run() is the main entry point for running inference benchmarks.
"""

from copy import deepcopy
from pathlib import Path

import torch

from benchmarks.microbenchmarks.utils import (
    BenchmarkConfig,
    BenchmarkResult,
    clean_caches,
    create_model_and_input,
    model_inference_time_in_ms,
    string_to_config,
)
from torchao.quantization import quantize_


def run(config: BenchmarkConfig) -> BenchmarkResult:
    """Run inference benchmarks"""
    clean_caches()  # Clean caches

    # Create output directory if it doesn't exist
    Path(config.output_dir).mkdir(parents=True, exist_ok=True)

    base_model, input_data = create_model_and_input(
        config.model_type,
        config.m,
        config.k,
        config.n,
        high_precision_dtype=config.high_precision_dtype,
        device=config.device,
    )

    # Use quantize_ to apply the requested quantization config to the model
    m_copy = deepcopy(base_model).eval().to(config.device)
    quantization_config = string_to_config(
        config.quantization, high_precision_dtype=config.high_precision_dtype
    )
    if quantization_config is not None:
        quantize_(m_copy, quantization_config)
    if config.use_torch_compile:
        print("Compiling model....")
        m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True)

    # Run benchmarks
    result = BenchmarkResult(config=config)

    # Benchmark time to run an inference call for the quantized model
    result.model_inference_time_in_ms = model_inference_time_in_ms(
        model=m_copy, input_data=input_data
    )

    # TODO: Benchmark time using profiler
    # Profile dtype model evaluation
    # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype)
    # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json")  # Save profiling details

    # TODO: Benchmark gemm time using cuda graph
    # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs)

    # TODO: Benchmark op with cuda graph
    # time = benchmark_op_with_cuda_graph(op, args)

    return result
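
For orientation, `run()` is normally driven by the runner below, which constructs `BenchmarkConfig` objects from the YAML file. A minimal standalone sketch of that call is shown here; the `BenchmarkConfig` constructor and the `params` keys it accepts live in `utils.py`, which is not part of the excerpt above, so the field values are illustrative.

```python
# Illustrative sketch: mirrors how benchmark_runner.py builds a BenchmarkConfig
# and hands it to run(). Values are examples, not defaults.
from benchmarks.microbenchmarks.benchmark_inference import run
from benchmarks.microbenchmarks.utils import BenchmarkConfig

config = BenchmarkConfig(
    quantization="int4wo-128",
    params={
        "high_precision_dtype": "torch.bfloat16",
        "use_torch_compile": True,
        "torch_compile_mode": "max-autotune",
        "device": "cuda",
        "model_type": "linear",
    },
    shape_name="custom",
    shape=[1024, 1024, 1024],  # [m, k, n]
    output_dir="benchmarks/microbenchmarks/results",
    benchmark_mode="inference",
)

result = run(config)
print(result.model_inference_time_in_ms)
```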

benchmarks/microbenchmarks/benchmark_runner.py

@@ -0,0 +1,153 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Benchmark Runner

This is the main entry point for the benchmarking application. It reads the YAML configuration
file and orchestrates the entire benchmarking process by:
- Loading and validating benchmark configurations
- Executing benchmark scenarios
- Collecting and processing results
- Generating reports

Usage:
    python -m benchmarks.microbenchmarks.benchmark_runner --config path/to/config.yml

The YAML file should contain all necessary configuration parameters for the benchmarks.
"""

import argparse
from itertools import product
from typing import Any, Dict, List, Tuple

import yaml

from benchmarks.microbenchmarks.utils import (
    BenchmarkConfig,
    generate_results_csv,
    print_results,
)


def get_shapes_for_config(
    shape_configs: List[Dict[str, Any]],
) -> List[Tuple[str, List[int]]]:
    """Get shapes for a given configuration.

    Args:
        shape_configs: List of shape configurations from YAML

    Returns:
        List of tuples containing (shape_name, shape)
    """
    shapes = []
    for shape_config in shape_configs:
        name = shape_config["name"]
        if name == "custom":
            shapes.extend([(name, shape) for shape in shape_config["shapes"]])
        else:
            raise NotImplementedError(
                f"Shape config {name} not supported. Currently only supports custom shapes."
            )
    return shapes


def get_param_combinations(model_param):
    """Extract all parameter combinations from a model config"""
    # Get all shapes
    shapes = get_shapes_for_config(model_param["matrix_shapes"])

    # Extract all other parameters (excluding matrix_shapes)
    base_params = {
        key: value for key, value in model_param.items() if key not in ["matrix_shapes"]
    }

    return shapes, base_params


def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]:
    """Load benchmark configurations from CLI arguments and YAML file."""
    with open(cli_args.config, "r") as f:
        config = yaml.safe_load(f)

    output_dir = config.get("output_dir", "benchmarks/microbenchmarks/results")
    benchmark_mode = config.get("benchmark_mode", "inference")

    # Create all possible combinations
    configs = []
    for model_param in config["model_params"]:
        shapes, params = get_param_combinations(model_param)

        # Create configs for all combinations
        for quant_config, (shape_name, shape) in product(
            config.get("quantization_config_recipe_names", ["baseline"]), shapes
        ):
            configs.append(
                BenchmarkConfig(
                    quantization=quant_config,
                    params=params,
                    shape_name=shape_name,
                    shape=shape,
                    output_dir=output_dir,
                    benchmark_mode=benchmark_mode,
                )
            )

    return configs


def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None:
    """Run benchmarks using configurations from YAML file"""
    from benchmarks.microbenchmarks.benchmark_inference import run as run_inference

    results = []
    print("Benchmarking Inference ......")
    for config in configs:
        try:
            print(f"Running: {config.name}")
            result = run_inference(config)  # Pass the config object directly
            results.append(result)
        except Exception as e:
            print(f"Error running benchmark {config.name}: {e}")
            continue

    # Add results to csv
    generate_results_csv(results, configs[0].output_dir)

    # Print results
    print_results(results)

    # TODO: Process results: Speedups:
    # 1. For different shapes for same model and quantization
    # 2. For different quantizations for same model and shape
    # 3. For different models for same quantization


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run benchmarks from config file")
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to benchmark configuration file",
    )
    # TODO: Add support for args to override config values and run smaller benchmarks
    args = parser.parse_args()

    configs = load_benchmark_configs(cli_args=args)
    # Run benchmarks
    if configs[0].benchmark_mode == "inference":
        run_inference_benchmarks_from_config(configs)
    elif configs[0].benchmark_mode == "training":
        print("Training mode not implemented yet")
    else:
        raise ValueError(
            f"Invalid benchmark mode: {configs[0].benchmark_mode}, choose from inference or training"
        )
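
To make the expansion in `load_benchmark_configs` concrete: each entry in `model_params` is combined with every recipe in `quantization_config_recipe_names`, so the runner produces (number of recipes) x (number of shapes) `BenchmarkConfig` objects per entry. A small self-contained illustration of that `product()` step (values made up for the example):

```python
from itertools import product

recipes = ["baseline", "int4wo-128"]
shapes = [("custom", [1024, 1024, 1024]), ("custom", [2048, 4096, 1024])]

# 2 recipes x 2 shapes -> 4 (recipe, shape) combinations, i.e. 4 BenchmarkConfigs.
for recipe, (shape_name, shape) in product(recipes, shapes):
    print(recipe, shape_name, shape)
```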

benchmarks/microbenchmarks/test/__init__.py

Whitespace-only changes.

benchmarks/microbenchmarks/test/benchmark_config.yml

@@ -0,0 +1,43 @@
# Sample configuration for inference benchmarks
benchmark_mode: "inference"
quantization_config_recipe_names:
  - "baseline"
  - "int4wo-32"
  - "int4wo-128"
output_dir: "benchmarks/microbenchmarks/results"
model_params:
  - name: "small_bf16_linear"
    matrix_shapes:
      - name: "custom"
        shapes: [
          [1024, 1024, 1024], # [m, k, n]
        ]
    high_precision_dtype: "torch.bfloat16"
    use_torch_compile: true
    torch_compile_mode: "max-autotune"
    device: "cuda"
    model_type: "linear"

  - name: "large_bf16_ln_linear"
    matrix_shapes:
      - name: "custom"
        shapes: [
          [2048, 4096, 1024],
          [4096, 4096, 1024]
        ]
    high_precision_dtype: "torch.bfloat16"
    use_torch_compile: true
    torch_compile_mode: "max-autotune"
    device: "cuda"
    model_type: "ln_linear_sigmoid"

  - name: "cpu_fp32_linear"
    matrix_shapes:
      - name: "custom"
        shapes: [
          [4096, 4096, 1024]
        ]
    high_precision_dtype: "torch.float32"
    use_torch_compile: false
    device: "cpu"
    model_type: "linear"
