
Commit

powv wip
devmyriade committed Feb 1, 2025
1 parent e3f7ff6 commit 60ffeda
Showing 9 changed files with 76 additions and 6 deletions.
1 change: 1 addition & 0 deletions vllm/engine/llm_engine.py
@@ -1101,6 +1101,7 @@ def _process_model_outputs(self,
self._process_sequence_group_outputs(seq_group, output)
else:
self.output_processor.process_prompt_logprob(seq_group, output)
seq_group.powv = output[0].powv
if seq_group_meta.do_sample:
self.output_processor.process_outputs(
seq_group, output, is_async)
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -83,7 +83,7 @@
from vllm.version import __version__ as VLLM_VERSION

TIMEOUT_KEEP_ALIVE = 5 # seconds

POWV_VERIFY_VERSION = "2"
prometheus_multiproc_dir: tempfile.TemporaryDirectory

# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
4 changes: 4 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -1085,6 +1085,8 @@ class CompletionResponseChoice(OpenAIBaseModel):
"to stop, None if the completion finished for some other reason "
"including encountering the EOS token"),
)
powv: Optional[int] = None
token_ids: Optional[List[int]] = None
prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None


@@ -1109,6 +1111,8 @@ class CompletionResponseStreamChoice(OpenAIBaseModel):
"to stop, None if the completion finished for some other reason "
"including encountering the EOS token"),
)
powv: Optional[int] = None
token_ids: Optional[List[int]] = None


class CompletionStreamResponse(OpenAIBaseModel):
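With these additions, every completion choice (streaming or not) can carry the generated token ids and a powv value alongside the existing fields. A minimal client-side sketch of reading them (illustrative only: it assumes a vLLM server built from this commit is serving the OpenAI-compatible API on localhost:8000, and "my-model" is a placeholder model name):

import requests

# Standard OpenAI-compatible completions request; only the two response
# fields read below are new in this commit.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={"model": "my-model", "prompt": "Hello", "max_tokens": 8},
).json()

choice = resp["choices"][0]
# Both fields are Optional and may be None, depending on which code paths
# populate them.
print(choice.get("powv"))       # integer derived from the serving model's weights
print(choice.get("token_ids"))  # token ids of the generated completion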
5 changes: 5 additions & 0 deletions vllm/entrypoints/openai/serving_chat.py
@@ -379,6 +379,7 @@ async def chat_completion_stream_generator(
content="",
),
logprobs=None,
token_ids=res.prompt_token_ids,
finish_reason=None)
chunk = ChatCompletionStreamResponse(
id=request_id,
@@ -412,6 +413,8 @@
index=i,
delta=DeltaMessage(
content=last_msg_content),
                                token_ids=output.token_ids,
[Check failure: GitHub Actions / pre-commit, Ruff (F821): vllm/entrypoints/openai/serving_chat.py:416:51: Undefined name `output`]
                                powv=output.powv,
[Check failure: GitHub Actions / pre-commit, Ruff (F821): vllm/entrypoints/openai/serving_chat.py:417:46: Undefined name `output`]
logprobs=None,
finish_reason=None))
chunk = ChatCompletionStreamResponse(
@@ -539,6 +542,8 @@ async def chat_completion_stream_generator(
choice_data = ChatCompletionResponseStreamChoice(
index=i,
delta=delta_message,
token_ids=output.token_ids,
powv=output.powv,
logprobs=logprobs,
finish_reason=None)

2 changes: 2 additions & 0 deletions vllm/entrypoints/openai/serving_completion.py
@@ -335,6 +335,8 @@ async def completion_stream_generator(
logprobs=logprobs,
finish_reason=finish_reason,
stop_reason=stop_reason,
token_ids=delta_token_ids,
powv=output.powv,
)
])
if include_continuous_usage:
1 change: 1 addition & 0 deletions vllm/model_executor/layers/sampler.py
@@ -126,6 +126,7 @@ class SamplerOutput(
# Time taken in the model execute function. This will include model forward,
# block/sync across workers, cpu-gpu sync time and sampling time.
model_execute_time: Optional[float] = None
powv: Optional[int] = None

def __getitem__(self, idx: int) -> CompletionSequenceGroupOutput:
return self.outputs[idx]
14 changes: 10 additions & 4 deletions vllm/outputs.py
@@ -1,6 +1,6 @@
import time
from dataclasses import dataclass
from typing import Dict, Generic, List, MutableSequence, Optional
from typing import Dict, Generic, List, MutableSequence, Optional, Tuple
[Check failure: GitHub Actions / pre-commit, Ruff (F401): vllm/outputs.py:3:68: `typing.Tuple` imported but unused]
from typing import Sequence as GenericSequence
from typing import Union

@@ -41,6 +41,7 @@ class CompletionOutput:
finish_reason: Optional[str] = None
stop_reason: Union[int, str, None] = None
lora_request: Optional[LoRARequest] = None
powv: Optional[int] = None

def finished(self) -> bool:
return self.finish_reason is not None
@@ -116,6 +117,7 @@ def __init__(
encoder_prompt: Optional[str] = None,
encoder_prompt_token_ids: Optional[List[int]] = None,
num_cached_tokens: Optional[int] = None,
powv: Optional[int] = None,
*,
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
) -> None:
@@ -131,6 +133,7 @@ def __init__(
self.encoder_prompt = encoder_prompt
self.encoder_prompt_token_ids = encoder_prompt_token_ids
self.num_cached_tokens = num_cached_tokens
self.powv: Optional[int] = powv

@classmethod
def new(
@@ -283,6 +286,7 @@ def from_seq_group(
output.finish_reason = SequenceStatus.get_finished_reason(
seq.status)
output.stop_reason = seq.stop_reason
output.powv = seq_group.powv

else:
output = CompletionOutput(
@@ -291,7 +295,7 @@
seq.get_cumulative_logprob() if include_logprobs else None,
output_logprobs,
SequenceStatus.get_finished_reason(seq.status),
seq.stop_reason)
seq.stop_reason, powv=seq_group.powv)

outputs.append(output)

@@ -364,11 +368,12 @@ class PoolingRequestOutput(Generic[_O]):
"""

def __init__(self, request_id: str, outputs: _O,
prompt_token_ids: List[int], finished: bool):
prompt_token_ids: List[int], finished: bool, powv: Optional[int]=None):
self.request_id = request_id
self.prompt_token_ids = prompt_token_ids
self.finished = finished
self.outputs = outputs
self.powv = powv

@staticmethod
def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput":
@@ -381,7 +386,7 @@ def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput":
finished = seq_group.is_finished()

return PoolingRequestOutput(seq_group.request_id, output,
prompt_token_ids, finished)
prompt_token_ids, finished, seq_group.powv)

def __repr__(self):
"""
@@ -447,6 +452,7 @@ def from_base(request_output: PoolingRequestOutput):
outputs=EmbeddingOutput.from_base(request_output.outputs),
prompt_token_ids=request_output.prompt_token_ids,
finished=request_output.finished,

)


3 changes: 3 additions & 0 deletions vllm/sequence.py
@@ -653,13 +653,15 @@ def __init__(
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
powv: Optional[int]=None
) -> None:
self.request_id = request_id
self.seqs = seqs
self.first_seq = seqs[0]
self.arrival_time = arrival_time
self.is_single_seq = len(seqs) == 1
self.seqs_dict = {seq.seq_id: seq for seq in seqs}
self.powv = powv

self.sampling_params = sampling_params
self.metrics = RequestMetrics(arrival_time=arrival_time,
@@ -1078,6 +1080,7 @@ class CompletionSequenceGroupOutput(
samples: List[SequenceOutput]
# Prompt logprob for each prompt query token.
prompt_logprobs: Optional[PromptLogprobs]
powv: Optional[int]=None

def __repr__(self) -> str:
return (f"CompletionSequenceGroupOutput(samples={self.samples}, "
50 changes: 49 additions & 1 deletion vllm/worker/model_runner.py
@@ -14,7 +14,7 @@
import torch.distributed
import torch.nn as nn
from tqdm import tqdm

from math import floor
import vllm.envs as envs
from vllm.attention import AttentionMetadata, get_attn_backend
from vllm.attention.backends.abstract import AttentionState
@@ -1083,6 +1083,7 @@ def __init__(

# Lazy initialization
self.model: nn.Module # Set after load_model
self.model_num_params: int
# Set after load_model.
self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None
@@ -1162,6 +1163,37 @@ def load_model(self) -> None:
fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
backend=backend)


def get_powv(
self,
input_tokens,
response_tokens,
) -> int:
"""

Check failure on line 1172 in vllm/worker/model_runner.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/worker/model_runner.py:1172:81: E501 Line too long (86 > 80)
Calculates probability of weights value that can be used to verify the outputs
of a model were made with the model claimed.
"""
powv = 0
input_sum = sum(input_tokens)
output_sum = sum(response_tokens)
token_sum = input_sum + output_sum
param_index = token_sum % self.model_num_params
for k, param in enumerate(self.model.parameters()):
if k != param_index:
continue
if param.dim() == 1:
weights = param.tolist()
else:
tensor_index = output_sum % param.size()[0]
weights = param[tensor_index].tolist()
if len(weights) == 0:
param_index += 1
continue
weight_index = input_sum % len(weights)
powv = floor(weights[weight_index] * token_sum)
break
return powv

def get_model(self) -> nn.Module:
return self.model

@@ -1705,6 +1737,22 @@ def execute_model(
"finished_requests_ids": model_input.finished_requests_ids,
"request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
} if self.has_inner_state else {}


if(model_input.input_positions is not None and model_input.sampling_metadata is not None):
for i, o in enumerate(output.outputs):
seq_id = model_input.sampling_metadata.seq_groups[i].seq_ids[0]
input_tokens = (
                    model_input.sampling_metadata.seq_groups[i]
[Check failure: GitHub Actions / pre-commit, Ruff (F821): vllm/worker/model_runner.py:1746:35: Undefined name `output`]
.seq_data[seq_id]
.get_prompt_token_ids()
)
output_tokens = (
model_input.sampling_metadata.seq_groups[i]
.seq_data[seq_id]
.get_output_token_ids()
)
o.powv = self.get_powv(input_tokens, output_tokens)
if (self.observability_config is not None
and self.observability_config.collect_model_forward_time):
model_forward_start = torch.cuda.Event(enable_timing=True)
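For reference, the get_powv calculation above can be reproduced outside the worker, which is what makes the value usable for verification: a party holding the same weights recomputes it from the prompt and output token ids and compares it with the powv reported in the response. A standalone sketch (not part of this commit; it assumes model_num_params is the number of parameter tensors, i.e. len(list(model.parameters())), which matches how it is used as an index into model.parameters()):

from math import floor
from typing import List

import torch.nn as nn


def compute_powv(model: nn.Module, input_tokens: List[int],
                 response_tokens: List[int]) -> int:
    # Deterministically select one weight, indexed by sums of the prompt and
    # output token ids, and fold it into an integer together with those sums.
    params = list(model.parameters())
    input_sum = sum(input_tokens)
    output_sum = sum(response_tokens)
    token_sum = input_sum + output_sum

    powv = 0
    for param in params[token_sum % len(params):]:
        if param.dim() == 1:
            weights = param.tolist()
        else:
            # Pick a row of the tensor by the output-token sum.
            weights = param[output_sum % param.size()[0]].tolist()
        if len(weights) == 0:
            # Mirror the original: an empty tensor falls through to the next one.
            continue
        powv = floor(weights[input_sum % len(weights)] * token_sum)
        break
    return powv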
