diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index f1531e0fc0675..e3f502cdc14e6 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -142,12 +142,11 @@ def __init__(self,
         self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled
                                 or runner.cache_config.enable_prefix_caching)
         self.model_input_cls = self.runner._model_input_cls
-        self.attn_backend = self.runner.attn_backend
+        self.attn_backend: AttentionBackend = self.runner.attn_backend
         self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
         self.input_data = ModelInputForCPUBuilder.ModelInputData(
             self.runner.model_config.uses_mrope)
-        self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()(
-            self)
+        self.att_metadata_builder = self.attn_backend.get_builder_cls()(self)
 
     def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata):
         self.seq_group_metadata_list.append(seq_group_metadata)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 2b545d1b28bd2..51e1aced2f8c4 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -448,7 +448,7 @@ def __init__(self,
 
         self.runner = runner
         self.model_input_cls = self.runner._model_input_cls
-        self.attn_backend = self.runner.attn_backend
+        self.attn_backend: AttentionBackend = self.runner.attn_backend
         self.scheduler_config = self.runner.scheduler_config
         self.sliding_window = self.runner.sliding_window
         self.block_size = self.runner.block_size
@@ -1048,14 +1048,14 @@ def __init__(
         needs_attn_backend = (num_attn_heads != 0
                               or self.model_config.is_attention_free)
 
-        self.attn_backend = get_attn_backend(
+        self.attn_backend: Optional[AttentionBackend] = get_attn_backend(
             self.model_config.get_head_size(),
             self.model_config.dtype,
             self.kv_cache_dtype,
             self.block_size,
             self.model_config.is_attention_free,
         ) if needs_attn_backend else None
-        if self.attn_backend:
+        if self.attn_backend is not None:
             self.attn_state = self.attn_backend.get_state_cls()(
                 weakref.proxy(self))
         else:
@@ -1481,6 +1481,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                         )
                         self.set_active_prompt_adapters(
                             set(), prompt_adapter_mapping)
+                    assert isinstance(self.attn_backend, AttentionBackend)
                     graph_runner = CUDAGraphRunner(
                         self.model, self.attn_backend.get_name(),
                         self.attn_state.graph_clone(batch_size),
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index dee63a75c0605..1dac08d4e1c73 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -313,6 +313,8 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs):
         supported_attention_backends: List[str] = \
             _get_supported_attention_backends(
                 self.scheduler_config.chunked_prefill_enabled)
+
+        assert isinstance(self.attn_backend, AttentionBackend)
         if self.attn_backend.get_name() not in supported_attention_backends:
             ms_config_str: str = "Multi-Step + Chunked-Prefill" \
                 if self.scheduler_config.chunked_prefill_enabled \
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py
index 0bf522d5333ed..782eca9ccdb6d 100644
--- a/vllm/worker/openvino_worker.py
+++ b/vllm/worker/openvino_worker.py
@@ -371,10 +371,14 @@ def execute_model(
         else:
             seq_group_metadata_list = execute_model_req.seq_group_metadata_list
 
+        blocks_to_copy: List[Tuple[int, int]]
+        blocks_to_swap_in: List[Tuple[int, int]]
+        blocks_to_swap_out: List[Tuple[int, int]]
+
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             num_seq_groups: int = len(seq_group_metadata_list)
-            assert execute_model_req is not None
+            assert isinstance(execute_model_req, ExecuteModelRequest)
             blocks_to_copy = execute_model_req.blocks_to_copy
             blocks_to_swap_in = execute_model_req.blocks_to_swap_in
             blocks_to_swap_out = execute_model_req.blocks_to_swap_out
@@ -393,8 +397,8 @@ def execute_model(
             blocks_to_swap_out = data["blocks_to_swap_out"]
 
         if current_platform.is_openvino_cpu():
-            assert len(execute_model_req.blocks_to_swap_in) == 0
-            assert len(execute_model_req.blocks_to_swap_out) == 0
+            assert len(blocks_to_swap_in) == 0
+            assert len(blocks_to_swap_out) == 0
         else:
             self.cache_swap_in(blocks_to_swap_in)
             self.cache_swap_out(blocks_to_swap_out)