[Frontend] Add /v1/audio/transcriptions OpenAI API endpoint (vllm-p…
NickLucche authored Feb 13, 2025
1 parent 37dfa60 commit d84cef7
Showing 20 changed files with 910 additions and 19 deletions.
12 changes: 10 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -117,7 +117,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -205,7 +205,7 @@ steps:
- VLLM_USE_V1=1 pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/test_accuracy.py::test_lm_eval_accuracy_v1_engine
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: Examples Test # 25min
working_dir: "/vllm-workspace/examples"
@@ -339,6 +339,14 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: OpenAI API correctness
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 5min
source_file_dependencies:
- vllm/
13 changes: 13 additions & 0 deletions docs/source/serving/openai_compatible_server.md
@@ -41,6 +41,8 @@ We currently support the following OpenAI APIs:
- *Note: `parallel_tool_calls` and `user` parameters are ignored.*
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`).
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
- Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).

In addition, we have the following custom APIs:

@@ -296,6 +298,17 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s
:end-before: end-chat-embedding-extra-params
:::

(transcriptions-api)=

### Transcriptions API

Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.

<!-- TODO: api enforced limits + uploading audios -->

Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
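
As a rough illustration of what the new route accepts (this sketch is not part of the commit; the model name, port, and file name are assumptions), the endpoint takes a standard multipart form upload, so it should also be reachable without the OpenAI client, e.g. via `requests`:

```python
# Hedged sketch: assumes a vLLM server is already running on localhost:8000
# and serving an ASR model such as openai/whisper-large-v3.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={
            "model": "openai/whisper-large-v3",
            "language": "en",
            "temperature": 0.0,
        },
    )
print(resp.json())  # e.g. {"text": "..."}
```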

(tokenizer-api)=

### Tokenizer API
23 changes: 23 additions & 0 deletions examples/online_serving/openai_transcription_client.py
@@ -0,0 +1,23 @@
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI

from vllm.assets.audio import AudioAsset

mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path()
winning_call = AudioAsset('winning_call').get_local_path()

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
with open(str(mary_had_lamb), "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="text",
temperature=0.0)
print("transcription result:", transcription)
7 changes: 3 additions & 4 deletions requirements-common.txt
@@ -8,12 +8,11 @@ py-cpuinfo
transformers >= 4.48.2 # Required for Bamba model and Transformers backend.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
fastapi[standard] >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi[standard] >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp
openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pydantic >= 2.9
prometheus_client >= 0.18.0
pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
1 change: 1 addition & 0 deletions requirements-test.in
@@ -19,6 +19,7 @@ pqdm
ray[adag]==2.40.0
sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm # required for internvl test
torch==2.5.1
torchaudio==2.5.1
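
`jiwer` is pulled in for the new transcription correctness tests, which score model output by Word Error Rate (WER): word-level substitutions, deletions, and insertions divided by the number of words in the reference. A minimal sketch of that kind of check (the sentences are made up for illustration):

```python
import jiwer

reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown fox jumped over the dog"

# jiwer.wer() returns a fraction; the CI test reports WER as a percentage.
print(f"WER: {100 * jiwer.wer(reference, hypothesis):.1f}%")
```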
5 changes: 5 additions & 0 deletions requirements-test.txt
@@ -66,6 +66,7 @@ charset-normalizer==3.4.0
click==8.1.7
# via
# black
# jiwer
# nltk
# ray
colorama==0.4.6
@@ -187,6 +188,8 @@ jinja2==3.1.4
# via
# datamodel-code-generator
# torch
jiwer==3.0.5
# via -r requirements-test.in
jmespath==1.0.1
# via
# boto3
@@ -470,6 +473,8 @@ pyyaml==6.0.2
# timm
# transformers
# vocos
rapidfuzz==3.12.1
# via jiwer
ray[adag]==2.40.0
# via -r requirements-test.in
redis==5.2.0
Empty file.
@@ -13,7 +13,7 @@

from vllm.platforms import current_platform

from ...utils import RemoteOpenAIServer
from ....utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
@@ -0,0 +1,166 @@
# SPDX-License-Identifier: Apache-2.0
"""
Evaluate Transcription API correctness by computing Word Error Rate (WER)
on a given ASR dataset. When provided, it will also compare the WER against
a baseline.
This simulates real-world usage of the API and makes sure that the frontend and
AsyncLLMEngine are working correctly.
"""
import asyncio
import io
import time
from statistics import mean, median
from typing import List

import librosa
import pytest
import soundfile
import torch
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer

from ....utils import RemoteOpenAIServer


def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer


async def transcribe_audio(client, tokenizer, y, sr):
# Send loaded audio directly instead of loading from disk,
    # don't account for that time, though.
with to_bytes(y, sr) as f:
start_time = time.perf_counter()
transcription = await client.audio.transcriptions.create(
file=f,
model=tokenizer.name_or_path,
language="en",
temperature=0.0,
)
end_time = time.perf_counter()
        # NOTE: there's no streaming in transcriptions, so we can't measure TTFT.
latency = end_time - start_time
num_output_tokens = len(
tokenizer(transcription.text, add_special_tokens=False).input_ids)
return latency, num_output_tokens, transcription.text


async def bound_transcribe(model_name, sem, client, audio, reference):
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use semaphore to limit concurrent requests.
async with sem:
result = await transcribe_audio(client, tokenizer, *audio)
        # Normalize *English* output/reference for evaluation.
out = tokenizer.normalize(result[2])
ref = tokenizer.normalize(reference)
return result[:2] + (out, ref)


async def process_dataset(model, client, data, concurrent_request):
sem = asyncio.Semaphore(concurrent_request)

    # Warmup call, as the first server-side `librosa.load` is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(model, sem, client, (audio, sr), "")

tasks: List[asyncio.Task] = []
for sample in data:
audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
task = asyncio.create_task(
bound_transcribe(model, sem, client, (audio, sr), sample["text"]))
tasks.append(task)
return await asyncio.gather(*tasks)


def print_performance_metrics(results, total_time):
latencies = [res[0] for res in results]
total_tokens = sum([res[1] for res in results])

total = len(results)
print(f"Total Requests: {total}")
print(f"Successful Requests: {len(latencies)}")
print(f"Average Latency: {mean(latencies):.4f} seconds")
print(f"Median Latency: {median(latencies):.4f} seconds")
perc = sorted(latencies)[int(len(latencies) * 0.95) - 1]
print(f"95th Percentile Latency: {perc:.4f} seconds")
# Throughput
req_throughput = len(latencies) / total_time
print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s")
throughput = total_tokens / total_time
print(f"Estimated Throughput: {throughput:.2f} tok/s")


def add_duration(sample):
y, sr = sample['audio']["array"], sample['audio']["sampling_rate"]
sample['duration_ms'] = librosa.get_duration(y=y, sr=sr) * 1000
return sample


def load_hf_dataset(dataset_repo: str, split='validation', **hf_kwargs):
## Load and filter the dataset
dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
if 'duration_ms' not in dataset[0]:
# compute duration to filter
dataset = dataset.map(add_duration)

# Whisper max supported duration
dataset = dataset.filter(lambda example: example['duration_ms'] < 30000)
return dataset


def run_evaluation(model: str,
client,
dataset,
max_concurrent_reqs: int,
n_examples: int = -1,
print_metrics: bool = True):
if n_examples > 0:
dataset = dataset.select(range(n_examples))
start = time.perf_counter()
results = asyncio.run(
process_dataset(model, client, dataset, max_concurrent_reqs))
end = time.perf_counter()
total_time = end - start
print(f"Total Test Time: {total_time:.4f} seconds")
if print_metrics:
print_performance_metrics(results, total_time)
# Compute WER
predictions = [res[2] for res in results]
references = [res[3] for res in results]
wer = load("wer")
wer_score = 100 * wer.compute(references=references,
predictions=predictions)
print("WER:", wer_score)
return wer_score


# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
"dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"])
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(model_name,
dataset_repo,
expected_wer,
n_examples=-1,
max_concurrent_request=None):
with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server:
dataset = load_hf_dataset(dataset_repo)

if not max_concurrent_request:
# No max concurrency
max_concurrent_request = n_examples if n_examples > 0\
else len(dataset)

client = remote_server.get_async_client()
wer = run_evaluation(model_name, client, dataset,
max_concurrent_request, n_examples)
if expected_wer:
torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)
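
For ad-hoc benchmarking outside pytest, the helpers above could in principle be driven directly; a rough sketch (the server/client setup mirrors the test fixture and is an assumption, not part of the committed file):

```python
# Hypothetical standalone driver reusing load_hf_dataset/run_evaluation from above.
if __name__ == "__main__":
    model = "openai/whisper-large-v3"
    with RemoteOpenAIServer(model, ['--enforce-eager']) as server:
        data = load_hf_dataset(
            "D4nt3/esb-datasets-earnings22-validation-tiny-filtered")
        run_evaluation(model, server.get_async_client(), data,
                       max_concurrent_reqs=16, n_examples=32)
```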