diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index b15177ce7540..7bd37dc76f79 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -497,6 +497,7 @@ def _post_completion(self, response: ModelResponse) -> float:
         stats += 'Response Latency: %.3f seconds\n' % latest_latency.latency
 
         usage: Usage | None = response.get('usage')
+        response_id = response.get('id', 'unknown')
 
         if usage:
             # keep track of the input and output tokens
@@ -539,6 +540,7 @@ def _post_completion(self, response: ModelResponse) -> float:
                 completion_tokens=completion_tokens,
                 cache_read_tokens=cache_hit_tokens,
                 cache_write_tokens=cache_write_tokens,
+                response_id=response_id,
             )
 
         # log the stats
diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py
index a927580fbe60..e4f52b1d3f31 100644
--- a/openhands/llm/metrics.py
+++ b/openhands/llm/metrics.py
@@ -25,7 +25,7 @@ class TokensUsage(BaseModel):
     completion_tokens: int
     cache_read_tokens: int
     cache_write_tokens: int
-    timestamp: float = Field(default_factory=time.time)
+    response_id: str
 
 
 class Metrics:
@@ -90,6 +90,7 @@ def add_tokens_usage(
         completion_tokens: int,
         cache_read_tokens: int,
         cache_write_tokens: int,
+        response_id: str,
     ) -> None:
         # accumulate
         self._accumulated_prompt_tokens += prompt_tokens
@@ -105,6 +106,7 @@
                 completion_tokens=completion_tokens,
                 cache_read_tokens=cache_read_tokens,
                 cache_write_tokens=cache_write_tokens,
+                response_id=response_id,
             )
         )
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index 1bfee8550698..f94e98676ff4 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -429,3 +429,38 @@ def test_get_token_count_error_handling(
     mock_logger.error.assert_called_once_with(
         'Error getting token count for\n model gpt-4o\nToken counting failed'
     )
+
+
+@patch('openhands.llm.llm.litellm_completion')
+def test_llm_token_usage(mock_litellm_completion, default_config):
+    # This mock response includes usage details with prompt_tokens,
+    # completion_tokens, prompt_tokens_details.cached_tokens, and model_extra.cache_creation_input_tokens
+    mock_response = {
+        'id': 'test-response-usage',
+        'choices': [{'message': {'content': 'Usage test response'}}],
+        'usage': {
+            'prompt_tokens': 12,
+            'completion_tokens': 3,
+            'prompt_tokens_details': {'cached_tokens': 2},
+            'model_extra': {'cache_creation_input_tokens': 5},
+        },
+    }
+    mock_litellm_completion.return_value = mock_response
+
+    llm = LLM(config=default_config)
+    _ = llm.completion(messages=[{'role': 'user', 'content': 'Hello usage!'}])
+
+    # Check that the metrics tracked these tokens
+    assert llm.metrics.get()['accumulated_prompt_tokens'] == 12
+    assert llm.metrics.get()['accumulated_completion_tokens'] == 3
+    assert llm.metrics.get()['accumulated_cache_read_tokens'] == 2
+    assert llm.metrics.get()['accumulated_cache_write_tokens'] == 5
+
+    # Also verify tokens_usages has a single entry with the exact usage
+    tokens_usage_list = llm.metrics.get()['tokens_usages']
+    assert len(tokens_usage_list) == 1
+    usage_entry = tokens_usage_list[0]
+    assert usage_entry['prompt_tokens'] == 12
+    assert usage_entry['completion_tokens'] == 3
+    assert usage_entry['cache_read_tokens'] == 2
+    assert usage_entry['cache_write_tokens'] == 5
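
A minimal sketch of how the patched metrics API could be exercised directly, for reference. Only `add_tokens_usage`, the new `response_id` field, and the `tokens_usages` key of `Metrics.get()` come from the diff above; the `Metrics(model_name=...)` constructor argument and the standalone usage shown here are assumptions, not part of this change.

```python
# Sketch only: drives the patched Metrics API outside the LLM wrapper.
# The Metrics(model_name=...) constructor is assumed; the keyword arguments
# and the 'tokens_usages' key follow the hunks and test in this diff.
from openhands.llm.metrics import Metrics

metrics = Metrics(model_name='gpt-4o')  # assumed constructor signature

# After this patch, each per-response usage record carries the provider's
# response id (llm.py falls back to 'unknown') instead of a local timestamp.
metrics.add_tokens_usage(
    prompt_tokens=12,
    completion_tokens=3,
    cache_read_tokens=2,
    cache_write_tokens=5,
    response_id='test-response-usage',
)

for entry in metrics.get()['tokens_usages']:
    print(entry['response_id'], entry['prompt_tokens'], entry['completion_tokens'])
```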