diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index e19ace782feb5..2ead1f51ed81e 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -30,7 +30,7 @@ function cpu_tests() { # offline inference docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " set -e - python3 examples/offline_inference/basic.py" + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 99972afa21d1e..20aca328ba135 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -24,5 +24,5 @@ remove_docker_container # Run the image and test offline inference docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B + python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B ' diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index 1edcb1d2669e9..f83eb927aae4e 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m EXITCODE=$? 
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 6159b21ff8206..a1103bed66ecb 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 4d344e58db8ac..d48639e5720c5 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -14,6 +14,6 @@ remove_docker_container # Run the image and test offline inference/tensor parallel docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference/basic.py - python3 examples/offline_inference/cli.py -tp 2 + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9d05ff4c2cfdd..66efe3ed32986 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -215,18 +215,18 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/basic.py - - python3 offline_inference/cpu_offload.py - - python3 offline_inference/chat.py + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - python3 offline_inference/vision_language.py - python3 offline_inference/vision_language_multi_image.py - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py - - python3 offline_inference/classification.py - - python3 offline_inference/embedding.py - - python3 offline_inference/scoring.py + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 9d4de18a3b79d..c5f75953aaf24 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -147,7 +147,7 @@ def generate(self) -> str: return content content += "## Example materials\n\n" - for file in self.other_files: + for file in sorted(self.other_files): include = "include" if file.suffix == ".md" else "literalinclude" content += f":::{{admonition}} {file.relative_to(self.path)}\n" content += ":class: dropdown\n\n" @@ -194,7 +194,7 @@ def generate_examples(): path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", title="Offline Inference", description= - 
"Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501 + "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with .", # noqa: E501 caption="Examples", ), } diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md index d53430403583c..9c5977939cc56 100644 --- a/docs/source/getting_started/installation/cpu/index.md +++ b/docs/source/getting_started/installation/cpu/index.md @@ -170,7 +170,7 @@ vLLM CPU backend supports the following vLLM features: sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library find / -name *libtcmalloc* # find the dynamic link library path export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -python examples/offline_inference/basic.py # run vLLM +python examples/offline_inference/basic/basic.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: @@ -207,7 +207,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 $ export VLLM_CPU_OMP_THREADS_BIND=0-7 -$ python examples/offline_inference/basic.py +$ python examples/offline_inference/basic/basic.py ``` - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index f4682ee45a48e..f3a4773f0fc6c 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). 
See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 4abe6b776eea3..f31e5715d1754 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -46,7 +46,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 764b67241999e..8612935432b89 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -88,7 +88,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -103,7 +103,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -125,7 +125,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: ## Online Serving diff --git a/examples/offline_inference/aqlm_example.py b/examples/offline_inference/aqlm_example.py deleted file mode 100644 index e8db3811ff171..0000000000000 --- a/examples/offline_inference/aqlm_example.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM, SamplingParams -from vllm.utils import FlexibleArgumentParser - - -def main(): - - parser = FlexibleArgumentParser(description='AQLM examples') - - parser.add_argument('--model', - '-m', - type=str, - default=None, - help='model path, as for HF') - parser.add_argument('--choice', - '-c', - type=int, - default=0, - help='known good models by index, [0-4]') - parser.add_argument('--tensor-parallel-size', - '-t', - type=int, - default=1, - help='tensor parallel size') - - args = parser.parse_args() - - models = [ - "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", - "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf", - "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf", - "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf", - "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf", - ] - - model = LLM(args.model if args.model is not None else models[args.choice], - tensor_parallel_size=args.tensor_parallel_size) - - sampling_params = SamplingParams(max_tokens=100, temperature=0) - outputs = model.generate("Hello my name is", - sampling_params=sampling_params) - print(outputs[0].outputs[0].text) - - -if __name__ == '__main__': - main() diff --git a/examples/offline_inference/arctic.py b/examples/offline_inference/arctic.py deleted file mode 100644 index 90c88446c5146..0000000000000 --- a/examples/offline_inference/arctic.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM, SamplingParams - -# Sample prompts. 
-prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="snowflake/snowflake-arctic-instruct", - quantization="deepspeedfp", - tensor_parallel_size=8, - trust_remote_code=True) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. - -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/basic/README.md b/examples/offline_inference/basic/README.md new file mode 100644 index 0000000000000..5cb0177b355df --- /dev/null +++ b/examples/offline_inference/basic/README.md @@ -0,0 +1,94 @@ +# Basic + +The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server. + +## Usage + +The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here. + +```bash +python examples/offline_inference/basic/basic.py +``` + +The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments. + +```bash +python examples/offline_inference/basic/classify.py +``` + +```bash +python examples/offline_inference/basic/embed.py +``` + +```bash +python examples/offline_inference/basic/score.py +``` + +The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`. + +```bash +python examples/offline_inference/basic/chat.py +``` + +```bash +python examples/offline_inference/basic/generate.py +``` + +## Features + +In the scripts that support passing arguments, you can experiment with the following features. + +### Default generation config + +The `--generation-config` argument specifies where the generation config will be loaded from when calling `LLM.get_default_sampling_params()`. If set to ‘auto’, the generation config will be loaded from model path. If set to a folder path, the generation config will be loaded from the specified folder path. If it is not provided, vLLM defaults will be used. + +> If max_new_tokens is specified in generation config, then it sets a server-wide limit on the number of output tokens for all requests. + +Try it yourself with the following argument: + +```bash +--generation-config auto +``` + +### Quantization + +#### AQLM + +vLLM supports models that are quantized using AQLM. + +Try one yourself by passing one of the following models to the `--model` argument: + +- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf` +- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf` +- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf` +- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf` +- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf` + +> Some of these models are likely to be too large for a single GPU. 
You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs. + +#### GGUF + +vLLM supports models that are quantized using GGUF. + +Try one yourself by downloading a GGUF quantized model and using the following arguments: + +```python +from huggingface_hub import hf_hub_download +repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF" +filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf" +print(hf_hub_download(repo_id, filename=filename)) +``` + +```bash +--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct +``` + +### CPU offload + +The `--cpu-offload-gb` argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory. Note that this requires fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass. + +Try it yourself with the following arguments: + +```bash +--model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 +``` diff --git a/examples/offline_inference/basic.py b/examples/offline_inference/basic/basic.py similarity index 100% rename from examples/offline_inference/basic.py rename to examples/offline_inference/basic/basic.py diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py new file mode 100644 index 0000000000000..b2523e533a40a --- /dev/null +++ b/examples/offline_inference/basic/chat.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def main(args: dict): + # Pop arguments not used by LLM + max_tokens = args.pop("max_tokens") + temperature = args.pop("temperature") + top_p = args.pop("top_p") + top_k = args.pop("top_k") + chat_template_path = args.pop("chat_template_path") + + # Create an LLM + llm = LLM(**args) + + # Create sampling params object + sampling_params = llm.get_default_sampling_params() + if max_tokens is not None: + sampling_params.max_tokens = max_tokens + if temperature is not None: + sampling_params.temperature = temperature + if top_p is not None: + sampling_params.top_p = top_p + if top_k is not None: + sampling_params.top_k = top_k + + def print_outputs(outputs): + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Generated text: {generated_text!r}") + print("-" * 80) + + print("=" * 80) + + # In this script, we demonstrate how to pass input to the chat method: + conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": + "Write an essay about the importance of higher education.", + }, + ] + outputs = llm.chat(conversation, sampling_params, use_tqdm=False) + print_outputs(outputs) + + # You can run batch inference with llm.chat API + conversations = [conversation for _ in range(10)] + + # We turn on tqdm progress bar to verify it's indeed running batch inference + outputs = llm.chat(conversations, sampling_params, use_tqdm=True) + print_outputs(outputs) + + # A chat template can be optionally supplied. + # If not, the model will use its default chat template. 
+ if chat_template_path is not None: + with open(chat_template_path) as f: + chat_template = f.read() + + outputs = llm.chat( + conversations, + sampling_params, + use_tqdm=False, + chat_template=chat_template, + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + # Add engine args + engine_group = parser.add_argument_group("Engine arguments") + EngineArgs.add_cli_args(engine_group) + engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + # Add example params + parser.add_argument("--chat-template-path", type=str) + args: dict = vars(parser.parse_args()) + main(args) diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py new file mode 100644 index 0000000000000..4ef949b4784de --- /dev/null +++ b/examples/offline_inference/basic/classify.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # Create an LLM. + # You should pass task="classify" for classification models + model = LLM(**vars(args)) + + # Generate logits. The output is a list of ClassificationRequestOutputs. + outputs = model.classify(prompts) + + # Print the outputs. + for prompt, output in zip(prompts, outputs): + probs = output.outputs.probs + probs_trimmed = ((str(probs[:16])[:-1] + + ", ...]") if len(probs) > 16 else probs) + print(f"Prompt: {prompt!r} | " + f"Class Probabilities: {probs_trimmed} (size={len(probs)})") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach", + task="classify", + enforce_eager=True) + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py new file mode 100644 index 0000000000000..f1655b6dbe111 --- /dev/null +++ b/examples/offline_inference/basic/embed.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # Create an LLM. + # You should pass task="embed" for embedding models + model = LLM(**vars(args)) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + outputs = model.embed(prompts) + + # Print the outputs. 
+ for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ((str(embeds[:16])[:-1] + + ", ...]") if len(embeds) > 16 else embeds) + print(f"Prompt: {prompt!r} | " + f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="intfloat/e5-mistral-7b-instruct", + task="embed", + enforce_eager=True) + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py new file mode 100644 index 0000000000000..93f4f2a36fac6 --- /dev/null +++ b/examples/offline_inference/basic/generate.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def main(args: dict): + # Pop arguments not used by LLM + max_tokens = args.pop("max_tokens") + temperature = args.pop("temperature") + top_p = args.pop("top_p") + top_k = args.pop("top_k") + + # Create an LLM + llm = LLM(**args) + + # Create a sampling params object + sampling_params = llm.get_default_sampling_params() + if max_tokens is not None: + sampling_params.max_tokens = max_tokens + if temperature is not None: + sampling_params.temperature = temperature + if top_p is not None: + sampling_params.top_p = top_p + if top_k is not None: + sampling_params.top_k = top_k + + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + # Add engine args + engine_group = parser.add_argument_group("Engine arguments") + EngineArgs.add_cli_args(engine_group) + engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + # Add sampling params + sampling_group = parser.add_argument_group("Sampling parameters") + sampling_group.add_argument("--max-tokens", type=int) + sampling_group.add_argument("--temperature", type=float) + sampling_group.add_argument("--top-p", type=float) + sampling_group.add_argument("--top-k", type=int) + args: dict = vars(parser.parse_args()) + main(args) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py new file mode 100644 index 0000000000000..2d21f1f0e3971 --- /dev/null +++ b/examples/offline_inference/basic/score.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def main(args: Namespace): + # Sample prompts. + text_1 = "What is the capital of France?" + texts_2 = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + ] + + # Create an LLM. + # You should pass task="score" for cross-encoder models + model = LLM(**vars(args)) + + # Generate scores. The output is a list of ScoringRequestOutputs. + outputs = model.score(text_1, texts_2) + + # Print the outputs. 
+ for text_2, output in zip(texts_2, outputs): + score = output.outputs.score + print(f"Pair: {[text_1, text_2]!r} | Score: {score}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults(model="BAAI/bge-reranker-v2-m3", + task="score", + enforce_eager=True) + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference/basic_with_model_default_sampling.py b/examples/offline_inference/basic_with_model_default_sampling.py deleted file mode 100644 index 80de9428f6a9a..0000000000000 --- a/examples/offline_inference/basic_with_model_default_sampling.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -# Create an LLM with built-in default generation config. -# The generation config is set to None by default to keep -# the behavior consistent with the previous version. -# If you want to use the default generation config from the model, -# you should set the generation_config to "auto". -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto") - -# Load the default sampling parameters from the model. -sampling_params = llm.get_default_sampling_params() -# Modify the sampling parameters if needed. -sampling_params.temperature = 0.5 - -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/chat.py b/examples/offline_inference/chat.py deleted file mode 100644 index dbc710cc8a0b7..0000000000000 --- a/examples/offline_inference/chat.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM, SamplingParams - -llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") -sampling_params = SamplingParams(temperature=0.5) - - -def print_outputs(outputs): - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - print("-" * 80) - - -print("=" * 80) - -# In this script, we demonstrate how to pass input to the chat method: - -conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, -] -outputs = llm.chat(conversation, - sampling_params=sampling_params, - use_tqdm=False) -print_outputs(outputs) - -# You can run batch inference with llm.chat API -conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" 
- }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, -] -conversations = [conversation for _ in range(10)] - -# We turn on tqdm progress bar to verify it's indeed running batch inference -outputs = llm.chat(messages=conversations, - sampling_params=sampling_params, - use_tqdm=True) -print_outputs(outputs) - -# A chat template can be optionally supplied. -# If not, the model will use its default chat template. - -# with open('template_falcon_180b.jinja', "r") as f: -# chat_template = f.read() - -# outputs = llm.chat( -# conversations, -# sampling_params=sampling_params, -# use_tqdm=False, -# chat_template=chat_template, -# ) diff --git a/examples/offline_inference/classification.py b/examples/offline_inference/classification.py deleted file mode 100644 index 4a364aeb8c47b..0000000000000 --- a/examples/offline_inference/classification.py +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -# Create an LLM. -# You should pass task="classify" for classification models -model = LLM( - model="jason9693/Qwen2.5-1.5B-apeach", - task="classify", - enforce_eager=True, -) - -# Generate logits. The output is a list of ClassificationRequestOutputs. -outputs = model.classify(prompts) - -# Print the outputs. -for prompt, output in zip(prompts, outputs): - probs = output.outputs.probs - probs_trimmed = ((str(probs[:16])[:-1] + - ", ...]") if len(probs) > 16 else probs) - print(f"Prompt: {prompt!r} | " - f"Class Probabilities: {probs_trimmed} (size={len(probs)})") diff --git a/examples/offline_inference/cli.py b/examples/offline_inference/cli.py deleted file mode 100644 index bc6833b3f39c5..0000000000000 --- a/examples/offline_inference/cli.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from dataclasses import asdict - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser - - -def get_prompts(num_prompts: int): - # The default sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - if num_prompts != len(prompts): - prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts] - - return prompts - - -def main(args): - # Create prompts - prompts = get_prompts(args.num_prompts) - - # Create a sampling params object. - sampling_params = SamplingParams(n=args.n, - temperature=args.temperature, - top_p=args.top_p, - top_k=args.top_k, - max_tokens=args.max_tokens) - - # Create an LLM. - # The default model is 'facebook/opt-125m' - engine_args = EngineArgs.from_cli_args(args) - llm = LLM(**asdict(engine_args)) - - # Generate texts from the prompts. - # The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. 
- for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -if __name__ == '__main__': - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - group = parser.add_argument_group("SamplingParams options") - group.add_argument("--num-prompts", - type=int, - default=4, - help="Number of prompts used for inference") - group.add_argument("--max-tokens", - type=int, - default=16, - help="Generated output length for sampling") - group.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt') - group.add_argument('--temperature', - type=float, - default=0.8, - help='Temperature for text generation') - group.add_argument('--top-p', - type=float, - default=0.95, - help='top_p for text generation') - group.add_argument('--top-k', - type=int, - default=-1, - help='top_k for text generation') - - args = parser.parse_args() - main(args) diff --git a/examples/offline_inference/cpu_offload.py b/examples/offline_inference/cpu_offload.py deleted file mode 100644 index 5511eb738778a..0000000000000 --- a/examples/offline_inference/cpu_offload.py +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/embedding.py b/examples/offline_inference/embedding.py deleted file mode 100644 index f9399329d24f3..0000000000000 --- a/examples/offline_inference/embedding.py +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -# Create an LLM. -# You should pass task="embed" for embedding models -model = LLM( - model="intfloat/e5-mistral-7b-instruct", - task="embed", - enforce_eager=True, -) - -# Generate embedding. The output is a list of EmbeddingRequestOutputs. -outputs = model.embed(prompts) - -# Print the outputs. -for prompt, output in zip(prompts, outputs): - embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} | " - f"Embeddings: {embeds_trimmed} (size={len(embeds)})") diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py deleted file mode 100644 index 0447e74e0d6f6..0000000000000 --- a/examples/offline_inference/gguf_inference.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from huggingface_hub import hf_hub_download - -from vllm import LLM, SamplingParams - - -def run_gguf_inference(model_path, tokenizer): - # Sample prompts. 
- prompts = [ - "How many helicopters can a human eat in one sitting?", - "What's the future of AI?", - ] - prompts = [[{"role": "user", "content": prompt}] for prompt in prompts] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0, max_tokens=128) - - # Create an LLM. - llm = LLM(model=model_path, tokenizer=tokenizer) - - outputs = llm.chat(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -if __name__ == "__main__": - repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF" - filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf" - tokenizer = "microsoft/Phi-3-medium-4k-instruct" - model = hf_hub_download(repo_id, filename=filename) - run_gguf_inference(model, tokenizer) diff --git a/examples/offline_inference/scoring.py b/examples/offline_inference/scoring.py deleted file mode 100644 index 7daa82b827727..0000000000000 --- a/examples/offline_inference/scoring.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from vllm import LLM - -# Sample prompts. -text_1 = "What is the capital of France?" -texts_2 = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." -] - -# Create an LLM. -# You should pass task="score" for cross-encoder models -model = LLM( - model="BAAI/bge-reranker-v2-m3", - task="score", - enforce_eager=True, -) - -# Generate scores. The output is a list of ScoringRequestOutputs. -outputs = model.score(text_1, texts_2) - -# Print the outputs. -for text_2, output in zip(texts_2, outputs): - score = output.outputs.score - print(f"Pair: {[text_1, text_2]!r} | Score: {score}") diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index ed50fe5350149..3be248f5aca45 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -14,7 +14,7 @@ def test_platform_plugins(): import os example_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(current_file))), - "examples", "offline_inference/basic.py") + "examples", "offline_inference/basic/basic.py") runpy.run_path(example_file) # check if the plugin is loaded correctly
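Usage sketch for the consolidated scripts added in this diff: `basic/generate.py` builds the `LLM` from `EngineArgs` CLI flags, so the workflows covered elsewhere in the diff can be driven from the command line. The flag spellings are assumed to match those referenced in the README and CI updates above (`--tensor-parallel-size`, `--cpu-offload-gb`, `--generation-config`); model names are taken from files touched in this diff.

```bash
# Plain generation with the small CI model
python examples/offline_inference/basic/generate.py --model facebook/opt-125m

# AQLM-quantized checkpoint, split across two GPUs
python examples/offline_inference/basic/generate.py \
    --model ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf --tensor-parallel-size 2

# CPU offload for a 13B model on a 24 GB GPU
python examples/offline_inference/basic/generate.py \
    --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10

# Use the model's own generation config for the default sampling params
python examples/offline_inference/basic/generate.py \
    --model Qwen/Qwen2.5-0.5B-Instruct --generation-config auto
```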