diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index b40f11ca8396..b15177ce7540 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -500,17 +500,17 @@ def _post_completion(self, response: ModelResponse) -> float:
 
         if usage:
             # keep track of the input and output tokens
-            input_tokens = usage.get('prompt_tokens')
-            output_tokens = usage.get('completion_tokens')
+            prompt_tokens = usage.get('prompt_tokens', 0)
+            completion_tokens = usage.get('completion_tokens', 0)
 
-            if input_tokens:
-                stats += 'Input tokens: ' + str(input_tokens)
+            if prompt_tokens:
+                stats += 'Input tokens: ' + str(prompt_tokens)
 
-            if output_tokens:
+            if completion_tokens:
                 stats += (
-                    (' | ' if input_tokens else '')
+                    (' | ' if prompt_tokens else '')
                     + 'Output tokens: '
-                    + str(output_tokens)
+                    + str(completion_tokens)
                     + '\n'
                 )
 
@@ -519,7 +519,7 @@ def _post_completion(self, response: ModelResponse) -> float:
                 'prompt_tokens_details'
             )
             cache_hit_tokens = (
-                prompt_tokens_details.cached_tokens if prompt_tokens_details else None
+                prompt_tokens_details.cached_tokens if prompt_tokens_details else 0
            )
             if cache_hit_tokens:
                 stats += 'Input tokens (cache hit): ' + str(cache_hit_tokens) + '\n'
@@ -528,10 +528,19 @@ def _post_completion(self, response: ModelResponse) -> float:
             # but litellm doesn't separate them in the usage stats
             # so we can read it from the provider-specific extra field
             model_extra = usage.get('model_extra', {})
-            cache_write_tokens = model_extra.get('cache_creation_input_tokens')
+            cache_write_tokens = model_extra.get('cache_creation_input_tokens', 0)
             if cache_write_tokens:
                 stats += 'Input tokens (cache write): ' + str(cache_write_tokens) + '\n'
 
+            # Record in metrics
+            # We'll treat cache_hit_tokens as "cache read" and cache_write_tokens as "cache write"
+            self.metrics.add_tokens_usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                cache_read_tokens=cache_hit_tokens,
+                cache_write_tokens=cache_write_tokens,
+            )
+
         # log the stats
         if stats:
             logger.debug(stats)
diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py
index a010bb26916d..a927580fbe60 100644
--- a/openhands/llm/metrics.py
+++ b/openhands/llm/metrics.py
@@ -17,11 +17,23 @@ class ResponseLatency(BaseModel):
     response_id: str
 
 
+class TokensUsage(BaseModel):
+    """Metric tracking detailed token usage per completion call."""
+
+    model: str
+    prompt_tokens: int
+    completion_tokens: int
+    cache_read_tokens: int
+    cache_write_tokens: int
+    timestamp: float = Field(default_factory=time.time)
+
+
 class Metrics:
     """Metrics class can record various metrics during running and evaluation.
     Currently, we define the following metrics:
         accumulated_cost: the total cost (USD $) of the current LLM.
         response_latency: the time taken for each LLM completion call.
+        accrued token usage: the total tokens used across all completions.
     """
 
     def __init__(self, model_name: str = 'default') -> None:
@@ -29,6 +41,11 @@ def __init__(self, model_name: str = 'default') -> None:
         self._costs: list[Cost] = []
         self._response_latencies: list[ResponseLatency] = []
         self.model_name = model_name
+        self._accumulated_prompt_tokens = 0
+        self._accumulated_completion_tokens = 0
+        self._accumulated_cache_read_tokens = 0
+        self._accumulated_cache_write_tokens = 0
+        self._tokens_usages: list[TokensUsage] = []
 
     @property
     def accumulated_cost(self) -> float:
@@ -67,16 +84,50 @@ def add_response_latency(self, value: float, response_id: str) -> None:
             )
         )
 
+    def add_tokens_usage(
+        self,
+        prompt_tokens: int,
+        completion_tokens: int,
+        cache_read_tokens: int,
+        cache_write_tokens: int,
+    ) -> None:
+        # accumulate
+        self._accumulated_prompt_tokens += prompt_tokens
+        self._accumulated_completion_tokens += completion_tokens
+        self._accumulated_cache_read_tokens += cache_read_tokens
+        self._accumulated_cache_write_tokens += cache_write_tokens
+
+        # record this individual usage
+        self._tokens_usages.append(
+            TokensUsage(
+                model=self.model_name,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                cache_read_tokens=cache_read_tokens,
+                cache_write_tokens=cache_write_tokens,
+            )
+        )
+
     def merge(self, other: 'Metrics') -> None:
         self._accumulated_cost += other.accumulated_cost
         self._costs += other._costs
         self._response_latencies += other._response_latencies
+        self._accumulated_prompt_tokens += other._accumulated_prompt_tokens
+        self._accumulated_completion_tokens += other._accumulated_completion_tokens
+        self._accumulated_cache_read_tokens += other._accumulated_cache_read_tokens
+        self._accumulated_cache_write_tokens += other._accumulated_cache_write_tokens
+        self._tokens_usages += other._tokens_usages
 
     def get(self) -> dict:
         """Return the metrics in a dictionary."""
         return {
             'accumulated_cost': self._accumulated_cost,
             'costs': [cost.model_dump() for cost in self._costs],
+            'accumulated_prompt_tokens': self._accumulated_prompt_tokens,
+            'accumulated_completion_tokens': self._accumulated_completion_tokens,
+            'accumulated_cache_read_tokens': self._accumulated_cache_read_tokens,
+            'accumulated_cache_write_tokens': self._accumulated_cache_write_tokens,
+            'tokens_usages': [usage.model_dump() for usage in self._tokens_usages],
             'response_latencies': [
                 latency.model_dump() for latency in self._response_latencies
             ],
@@ -86,6 +137,11 @@ def reset(self):
         self._accumulated_cost = 0.0
         self._costs = []
         self._response_latencies = []
+        self._accumulated_prompt_tokens = 0
+        self._accumulated_completion_tokens = 0
+        self._accumulated_cache_read_tokens = 0
+        self._accumulated_cache_write_tokens = 0
+        self._tokens_usages = []
 
     def log(self):
         """Log the metrics."""