diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index f0a295ace..84318de1a 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -59,6 +59,22 @@ class VllmRbProperties(Properties):
     enable_prefix_caching: Optional[bool] = False
     disable_sliding_window: Optional[bool] = False
     limit_mm_per_prompt: Optional[Mapping[str, int]] = None
+    use_v2_block_manager: bool = False
+
+    # Speculative decoding configuration.
+    speculative_model: Optional[str] = None
+    speculative_model_quantization: Optional[str] = None
+    speculative_draft_tensor_parallel_size: Optional[int] = None
+    num_speculative_tokens: Optional[int] = None
+    speculative_max_model_len: Optional[int] = None
+    speculative_disable_by_batch_size: Optional[int] = None
+    ngram_prompt_lookup_max: Optional[int] = None
+    ngram_prompt_lookup_min: Optional[int] = None
+    spec_decoding_acceptance_method: str = 'rejection_sampler'
+    typical_acceptance_sampler_posterior_threshold: Optional[float] = None
+    typical_acceptance_sampler_posterior_alpha: Optional[float] = None
+    qlora_adapter_name_or_path: Optional[str] = None
+    disable_logprobs_during_spec_decoding: Optional[bool] = None

     @field_validator('engine')
     def validate_engine(cls, engine):
diff --git a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
index 0da4e322b..d0a250283 100644
--- a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
+++ b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
@@ -266,6 +266,27 @@ def get_engine_args_from_config(config: VllmRbProperties) -> EngineArgs:
         enable_prefix_caching=config.enable_prefix_caching,
         disable_sliding_window=config.disable_sliding_window,
         max_num_seqs=config.max_rolling_batch_size,
+        use_v2_block_manager=config.use_v2_block_manager,
+        speculative_model=config.speculative_model,
+        speculative_model_quantization=config.
+        speculative_model_quantization,
+        speculative_draft_tensor_parallel_size=config.
+        speculative_draft_tensor_parallel_size,
+        num_speculative_tokens=config.num_speculative_tokens,
+        speculative_max_model_len=config.speculative_max_model_len,
+        speculative_disable_by_batch_size=config.
+        speculative_disable_by_batch_size,
+        ngram_prompt_lookup_max=config.ngram_prompt_lookup_max,
+        ngram_prompt_lookup_min=config.ngram_prompt_lookup_min,
+        spec_decoding_acceptance_method=config.
+        spec_decoding_acceptance_method,
+        typical_acceptance_sampler_posterior_threshold=config.
+        typical_acceptance_sampler_posterior_threshold,
+        typical_acceptance_sampler_posterior_alpha=config.
+        typical_acceptance_sampler_posterior_alpha,
+        qlora_adapter_name_or_path=config.qlora_adapter_name_or_path,
+        disable_logprobs_during_spec_decoding=config.
+        disable_logprobs_during_spec_decoding,
     )
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 2216225c8..eb14b23a8 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -463,6 +463,18 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "tiiuae/falcon-11B"
     },
+    "llama-68m-speculative-medusa": {
+        "max_memory_per_gpu": [25.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "JackFram/llama-68m"
+    },
+    "llama-68m-speculative-eagle": {
+        "max_memory_per_gpu": [25.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "JackFram/llama-68m"
+    },
     "llama-7b-unmerged-lora": {
         "max_memory_per_gpu": [15.0, 15.0],
         "batch_size": [3],
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 656946ea7..c07a1c38a 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -625,6 +625,24 @@
         "option.tensor_parallel_degree": 4,
         "option.enable_chunked_prefill": "true",
     },
+    "llama-68m-speculative-medusa": {
+        "option.model_id": "s3://djl-llm/llama-68m/",
+        "option.task": "text-generation",
+        "option.speculative_model": "s3://djl-llm/llama-2-tiny/",
+        "option.num_speculative_tokens": 4,
+        "option.use_v2_block_manager": True,
+        "option.tensor_parallel_degree": 1,
+        "option.max_rolling_batch_size": 4,
+    },
+    "llama-68m-speculative-eagle": {
+        "option.model_id": "s3://djl-llm/llama-68m/",
+        "option.task": "text-generation",
+        "option.speculative_model": "abhigoyal/vllm-eagle-llama-68m-random",
+        "option.num_speculative_tokens": 4,
+        "option.use_v2_block_manager": True,
+        "option.tensor_parallel_degree": 1,
+        "option.max_rolling_batch_size": 4,
+    },
     "llama-7b-unmerged-lora": {
         "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
         "option.tensor_parallel_degree": "max",
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 1f085ce38..ae9e73e24 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -604,6 +604,18 @@ def test_falcon_11b_chunked_prefill(self):
             client.run(
                 "vllm falcon-11b-chunked-prefill --in_tokens 1200".split())

+    def test_llama_68m_speculative_medusa(self):
+        with Runner('lmi', 'llama-68m-speculative-medusa') as r:
+            prepare.build_vllm_model("llama-68m-speculative-medusa")
+            r.launch()
+            client.run("vllm llama-68m-speculative-medusa".split())
+
+    def test_llama_68m_speculative_eagle(self):
+        with Runner('lmi', 'llama-68m-speculative-eagle') as r:
+            prepare.build_vllm_model("llama-68m-speculative-eagle")
+            r.launch()
+            client.run("vllm llama-68m-speculative-eagle".split())
+
     @pytest.mark.vllm
     @pytest.mark.lora
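For reference, a minimal stand-alone sketch (not part of the patch) of what the new option.* properties ultimately map to on the vLLM side, mirroring the "llama-68m-speculative-eagle" test config above. It assumes a vLLM build whose EngineArgs exposes the speculative decoding fields used by this patch; the model identifiers are taken from the test configs and are illustrative only.

    # sketch of the EngineArgs that get_engine_args_from_config would produce
    from vllm.engine.arg_utils import EngineArgs

    engine_args = EngineArgs(
        model="JackFram/llama-68m",          # option.model_id
        speculative_model="abhigoyal/vllm-eagle-llama-68m-random",  # option.speculative_model
        num_speculative_tokens=4,            # option.num_speculative_tokens
        use_v2_block_manager=True,           # option.use_v2_block_manager
        tensor_parallel_size=1,              # option.tensor_parallel_degree
        max_num_seqs=4,                      # option.max_rolling_batch_size
    )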