[Benchmarks] Make detokenization optional in benchmark scripts (#11697)
Signed-off-by: Jeremy Arnold <[email protected]>
JArnoldAMD authored Mar 7, 2025
1 parent f7ebad2 commit 58abe35
Showing 4 changed files with 45 additions and 7 deletions.
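
All four scripts gain the same --disable-detokenize flag: argparse parses it and the scripts forward it to vLLM's SamplingParams as detokenize=not args.disable_detokenize, so the engine skips converting output token ids back to text and the reported timings exclude that step (the HF backend instead skips its explicit tokenizer.batch_decode call). A minimal standalone sketch of the shared pattern follows; the model name and prompt are placeholders for illustration and are not part of this commit:

import argparse

from vllm import LLM, SamplingParams

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-detokenize",
    action="store_true",
    help="Do not detokenize responses (exclude detokenization time "
    "from the measurement)")
args = parser.parse_args()

# Placeholder model; each benchmark script builds its engine from EngineArgs.
llm = LLM(model="facebook/opt-125m")

sampling_params = SamplingParams(
    temperature=1.0,
    ignore_eos=True,
    max_tokens=128,
    # When --disable-detokenize is passed, output token ids are not
    # converted back to text, so detokenization time is excluded.
    detokenize=not args.disable_detokenize,
)

outputs = llm.generate(["Hello, my name is"], sampling_params)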
7 changes: 7 additions & 0 deletions benchmarks/benchmark_latency.py
@@ -52,6 +52,7 @@ def main(args: argparse.Namespace):
top_p=1.0,
ignore_eos=True,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(10000,
@@ -173,6 +174,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=None,
help="Path to save the latency results in JSON format.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)

parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
10 changes: 9 additions & 1 deletion benchmarks/benchmark_prefix_caching.py
@@ -194,7 +194,9 @@ def main(args):

llm = LLM(**dataclasses.asdict(engine_args))

sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
sampling_params = SamplingParams(temperature=0,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize)

print("Testing filtered requests")
prompts = repeat_and_sort_requests(filtered_requests,
@@ -243,6 +245,12 @@ def main(args):
"subtract this length when filtering prompts. Only used "
"when dataset-path is not provided.",
)
parser.add_argument(
'--disable-detokenize',
action='store_true',
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)

parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
13 changes: 11 additions & 2 deletions benchmarks/benchmark_prioritization.py
@@ -23,7 +23,7 @@ def sample_requests(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> list[tuple[str, int, int]]:
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")

@@ -71,6 +71,7 @@ def run_vllm(
requests: list[tuple[str, int, int]],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
@@ -95,6 +96,7 @@ def run_vllm(
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
detokenize=not disable_detokenize,
))

start = time.perf_counter()
@@ -121,7 +123,8 @@ def main(args: argparse.Namespace):

if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.n,
EngineArgs.from_cli_args(args))
EngineArgs.from_cli_args(args),
args.disable_detokenize)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
@@ -174,6 +177,12 @@ def main(args: argparse.Namespace):
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--disable-detokenize',
action='store_true',
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)

parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
22 changes: 18 additions & 4 deletions benchmarks/benchmark_throughput.py
@@ -168,6 +168,7 @@ def run_vllm(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
@@ -194,6 +195,7 @@ def run_vllm(
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora:
@@ -232,6 +234,7 @@ async def run_vllm_async(
n: int,
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False,
) -> float:
from vllm import SamplingParams

@@ -262,6 +265,7 @@ async def run_vllm_async(
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
lora_requests.append(request.lora_request)

@@ -288,6 +292,7 @@ def run_hf(
n: int,
max_batch_size: int,
trust_remote_code: bool,
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
@@ -327,8 +332,9 @@ def run_hf(
use_cache=True,
max_new_tokens=max_output_len,
)
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
if not disable_detokenize:
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))

# Clear the batch.
@@ -440,14 +446,17 @@ def main(args: argparse.Namespace):
args.n,
AsyncEngineArgs.from_cli_args(args),
args.disable_frontend_multiprocessing,
args.disable_detokenize,
))
else:
elapsed_time = run_vllm(requests, args.n,
EngineArgs.from_cli_args(args))
EngineArgs.from_cli_args(args),
args.disable_detokenize)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.hf_max_batch_size, args.trust_remote_code)
args.hf_max_batch_size, args.trust_remote_code,
args.disable_detokenize)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
@@ -526,6 +535,11 @@ def main(args: argparse.Namespace):
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"))
# LoRA
parser.add_argument(
"--lora-path",