diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py
index e4f52b1d3f31..1863bed86660 100644
--- a/openhands/llm/metrics.py
+++ b/openhands/llm/metrics.py
@@ -30,10 +30,10 @@ class TokensUsage(BaseModel):
 
 class Metrics:
     """Metrics class can record various metrics during running and evaluation.
-    Currently, we define the following metrics:
-        accumulated_cost: the total cost (USD $) of the current LLM.
-        response_latency: the time taken for each LLM completion call.
-        accrued token usage: the total tokens used across all completions.
+    We track:
+    - accumulated_cost and costs
+    - A list of ResponseLatency
+    - A list of TokensUsage (one per call).
     """
 
     def __init__(self, model_name: str = 'default') -> None:
@@ -41,10 +41,6 @@ def __init__(self, model_name: str = 'default') -> None:
         self._costs: list[Cost] = []
         self._response_latencies: list[ResponseLatency] = []
         self.model_name = model_name
-        self._accumulated_prompt_tokens = 0
-        self._accumulated_completion_tokens = 0
-        self._accumulated_cache_read_tokens = 0
-        self._accumulated_cache_write_tokens = 0
         self._tokens_usages: list[TokensUsage] = []
 
     @property
@@ -92,13 +88,7 @@ def add_tokens_usage(
         cache_write_tokens: int,
         response_id: str,
     ) -> None:
-        # accumulate
-        self._accumulated_prompt_tokens += prompt_tokens
-        self._accumulated_completion_tokens += completion_tokens
-        self._accumulated_cache_read_tokens += cache_read_tokens
-        self._accumulated_cache_write_tokens += cache_write_tokens
-
-        # record this individual usage
+        """Add a single usage record."""
         self._tokens_usages.append(
             TokensUsage(
                 model=self.model_name,
@@ -111,13 +101,10 @@ def add_tokens_usage(
         )
 
     def merge(self, other: 'Metrics') -> None:
+        """Merge 'other' metrics into this one."""
         self._accumulated_cost += other.accumulated_cost
         self._costs += other._costs
         self._response_latencies += other._response_latencies
-        self._accumulated_prompt_tokens += other._accumulated_prompt_tokens
-        self._accumulated_completion_tokens += other._accumulated_completion_tokens
-        self._accumulated_cache_read_tokens += other._accumulated_cache_read_tokens
-        self._accumulated_cache_write_tokens += other._accumulated_cache_write_tokens
         self._tokens_usages += other._tokens_usages
 
     def get(self) -> dict:
@@ -125,24 +112,16 @@ def get(self) -> dict:
         return {
             'accumulated_cost': self._accumulated_cost,
             'costs': [cost.model_dump() for cost in self._costs],
-            'accumulated_prompt_tokens': self._accumulated_prompt_tokens,
-            'accumulated_completion_tokens': self._accumulated_completion_tokens,
-            'accumulated_cache_read_tokens': self._accumulated_cache_read_tokens,
-            'accumulated_cache_write_tokens': self._accumulated_cache_write_tokens,
-            'tokens_usages': [usage.model_dump() for usage in self._tokens_usages],
             'response_latencies': [
                 latency.model_dump() for latency in self._response_latencies
             ],
+            'tokens_usages': [usage.model_dump() for usage in self._tokens_usages],
         }
 
     def reset(self):
         self._accumulated_cost = 0.0
         self._costs = []
         self._response_latencies = []
-        self._accumulated_prompt_tokens = 0
-        self._accumulated_completion_tokens = 0
-        self._accumulated_cache_read_tokens = 0
-        self._accumulated_cache_write_tokens = 0
         self._tokens_usages = []
 
     def log(self):
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index 0b35e45ee576..86a9a1b618de 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -463,46 +463,28 @@ def test_llm_token_usage(mock_litellm_completion, default_config):
 
     llm = LLM(config=default_config)
 
-    # First call: usage_1
-    _ = llm.completion(messages=[{'role': 'user', 'content': 'Hello usage!'}])
+    # First call
+    llm.completion(messages=[{'role': 'user', 'content': 'Hello usage!'}])
 
-    # Check that the metrics tracked these tokens for the first response
-    assert llm.metrics.get()['accumulated_prompt_tokens'] == 12
-    assert llm.metrics.get()['accumulated_completion_tokens'] == 3
-    assert llm.metrics.get()['accumulated_cache_read_tokens'] == 2
-    assert llm.metrics.get()['accumulated_cache_write_tokens'] == 5
-
-    # Also verify tokens_usages has a single entry with the exact usage
+    # Verify we have exactly one usage record after first call
     tokens_usage_list = llm.metrics.get()['tokens_usages']
     assert len(tokens_usage_list) == 1
-    usage_entry = tokens_usage_list[0]
-    assert usage_entry['prompt_tokens'] == 12
-    assert usage_entry['completion_tokens'] == 3
-    assert usage_entry['cache_read_tokens'] == 2
-    assert usage_entry['cache_write_tokens'] == 5
-    # Check the response_id
-    assert usage_entry['response_id'] == 'test-response-usage'
-
-    # Second call: usage_2
-    _ = llm.completion(messages=[{'role': 'user', 'content': 'Hello again!'}])
-
-    # Now check accumulated totals
-    metrics_dict = llm.metrics.get()
-    # Prompt tokens = 12 + 7 = 19
-    assert metrics_dict['accumulated_prompt_tokens'] == 19
-    # Completion tokens = 3 + 2 = 5
-    assert metrics_dict['accumulated_completion_tokens'] == 5
-    # Cache read = 2 + 1 = 3
-    assert metrics_dict['accumulated_cache_read_tokens'] == 3
-    # Cache write = 5 + 3 = 8
-    assert metrics_dict['accumulated_cache_write_tokens'] == 8
-
-    # Also verify we have two usage records now
-    tokens_usage_list = metrics_dict['tokens_usages']
+    usage_entry_1 = tokens_usage_list[0]
+    assert usage_entry_1['prompt_tokens'] == 12
+    assert usage_entry_1['completion_tokens'] == 3
+    assert usage_entry_1['cache_read_tokens'] == 2
+    assert usage_entry_1['cache_write_tokens'] == 5
+    assert usage_entry_1['response_id'] == 'test-response-usage'
+
+    # Second call
+    llm.completion(messages=[{'role': 'user', 'content': 'Hello again!'}])
+
+    # Now we expect two usage records total
+    tokens_usage_list = llm.metrics.get()['tokens_usages']
    assert len(tokens_usage_list) == 2
-    latest_entry = tokens_usage_list[-1]
-    assert latest_entry['prompt_tokens'] == 7
-    assert latest_entry['completion_tokens'] == 2
-    assert latest_entry['cache_read_tokens'] == 1
-    assert latest_entry['cache_write_tokens'] == 3
-    assert latest_entry['response_id'] == 'test-response-usage-2'
+    usage_entry_2 = tokens_usage_list[-1]
+    assert usage_entry_2['prompt_tokens'] == 7
+    assert usage_entry_2['completion_tokens'] == 2
+    assert usage_entry_2['cache_read_tokens'] == 1
+    assert usage_entry_2['cache_write_tokens'] == 3
+    assert usage_entry_2['response_id'] == 'test-response-usage-2'
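
Note: with the accumulated_* counters removed, a consumer that still needs running totals can derive them from the per-call records that Metrics.get() exposes under 'tokens_usages'. A minimal sketch under that assumption; the helper name sum_token_usage is hypothetical and not part of this diff:

    # Hypothetical helper (not in this diff): rebuild the removed accumulated_*
    # totals by summing the per-call TokensUsage records kept by Metrics.
    from openhands.llm.metrics import Metrics


    def sum_token_usage(metrics: Metrics) -> dict[str, int]:
        totals = {
            'prompt_tokens': 0,
            'completion_tokens': 0,
            'cache_read_tokens': 0,
            'cache_write_tokens': 0,
        }
        # Each entry is a TokensUsage dump with the four per-call token fields.
        for usage in metrics.get()['tokens_usages']:
            for key in totals:
                totals[key] += usage[key]
        return totals

With the two calls from the test above, sum_token_usage(llm.metrics) would yield prompt_tokens == 19 (12 + 7) and completion_tokens == 5 (3 + 2), the same totals the removed accumulated_* assertions used to check.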