3
3
import dataclasses
4
4
import os
5
5
import time
6
- from abc import ABC , abstractmethod
6
+ from abc import abstractmethod
7
7
from typing import Any , Dict , List , Optional , Set , Tuple , Type , Union
8
8
9
9
import cloudpickle
19
19
from vllm .sequence import ExecuteModelRequest , IntermediateTensors
20
20
from vllm .utils import (enable_trace_function_call_for_thread ,
21
21
resolve_obj_by_qualname , run_method ,
22
- update_environment_variables )
22
+ update_environment_variables ,
23
+ warn_for_unimplemented_methods )
23
24
from vllm .worker .model_runner_base import (BroadcastableModelInput ,
24
25
ModelRunnerBase ,
25
26
ModelRunnerInputBase )
26
27
27
28
logger = init_logger (__name__ )
28
29
29
30
30
- class WorkerBase (ABC ):
31
+ @warn_for_unimplemented_methods
32
+ class WorkerBase :
31
33
"""Worker interface that allows vLLM to cleanly separate implementations for
32
34
different hardware. Also abstracts control plane communication, e.g., to
33
35
communicate request metadata to other workers.
@@ -53,35 +55,31 @@ def __init__(
53
55
from vllm .platforms import current_platform
54
56
self .current_platform = current_platform
55
57
56
- @abstractmethod
57
58
def init_device (self ) -> None :
58
59
"""Initialize device state, such as loading the model or other on-device
59
60
memory allocations.
60
61
"""
61
62
raise NotImplementedError
62
63
63
- @abstractmethod
64
- def determine_num_available_blocks (self ) -> Tuple [int , int ]:
65
- """Determine the number of available blocks for the GPU KV cache and
66
- swappable CPU KV cache.
67
-
68
- The implementation may run profiling or other heuristics to determine
69
- the size of caches.
70
-
71
- Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
72
- are blocks that are "active" on the device and can be appended to.
73
- num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
74
- appended to.
75
- """
76
- raise NotImplementedError
77
-
78
- @abstractmethod
79
64
def initialize_cache (self , num_gpu_blocks : int ,
80
65
num_cpu_blocks : int ) -> None :
81
66
"""Initialize the KV cache with the given size in blocks.
82
67
"""
83
68
raise NotImplementedError
84
69
70
+ def get_model (self ) -> nn .Module :
71
+ raise NotImplementedError
72
+
73
+ def load_model (self ) -> None :
74
+ """Load model onto target device."""
75
+ raise NotImplementedError
76
+
77
+ def execute_model (
78
+ self ,
79
+ execute_model_req : Optional [ExecuteModelRequest ] = None
80
+ ) -> Optional [List [SamplerOutput ]]:
81
+ raise NotImplementedError
82
+
85
83
def start_worker_execution_loop (self ) -> None :
86
84
"""Execute model loop in parallel worker.
87
85
@@ -94,40 +92,43 @@ def start_worker_execution_loop(self) -> None:
94
92
if output is None :
95
93
return None
96
94
97
- @ abstractmethod
98
- def get_model ( self ) -> nn . Module :
99
- raise NotImplementedError
95
+ def determine_num_available_blocks ( self ) -> Tuple [ int , int ]:
96
+ """Determine the number of available blocks for the GPU KV cache and
97
+ swappable CPU KV cache.
100
98
101
- @abstractmethod
102
- def execute_model (
103
- self ,
104
- execute_model_req : Optional [ExecuteModelRequest ] = None
105
- ) -> Optional [List [SamplerOutput ]]:
99
+ The implementation may run profiling or other heuristics to determine
100
+ the size of caches.
101
+
102
+ Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
103
+ are blocks that are "active" on the device and can be appended to.
104
+ num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
105
+ appended to.
106
+ """
106
107
raise NotImplementedError
107
108
108
- @abstractmethod
109
109
def get_cache_block_size_bytes (self ) -> int :
110
110
"""Return the size of a single cache block, in bytes. Used in
111
111
speculative decoding.
112
112
"""
113
113
raise NotImplementedError
114
114
115
- @abstractmethod
116
115
def add_lora (self , lora_request : LoRARequest ) -> bool :
117
116
raise NotImplementedError
118
117
119
- @abstractmethod
120
118
def remove_lora (self , lora_id : int ) -> bool :
121
119
raise NotImplementedError
122
120
123
- @abstractmethod
124
121
def pin_lora (self , lora_id : int ) -> bool :
125
122
raise NotImplementedError
126
123
127
- @abstractmethod
128
124
def list_loras (self ) -> Set [int ]:
129
125
raise NotImplementedError
130
126
127
+ @property
128
+ def vocab_size (self ) -> int :
129
+ """Get vocabulary size from model configuration."""
130
+ return self .model_config .get_vocab_size ()
131
+
131
132
132
133
class DelegateWorkerBase (WorkerBase ):
133
134
"""
@@ -156,6 +157,10 @@ def initialize_cache(self, num_gpu_blocks: int,
156
157
num_cpu_blocks : int ) -> None :
157
158
self .worker .initialize_cache (num_gpu_blocks , num_cpu_blocks )
158
159
160
+ def load_model (self ) -> None :
161
+ """Load model onto target device."""
162
+ self .worker .load_model ()
163
+
159
164
def get_model (self ) -> nn .Module :
160
165
return self .worker .get_model ()
161
166
0 commit comments