track used tokens
enyst committed Feb 22, 2025
1 parent 70b21d1 commit d80c376
Showing 2 changed files with 74 additions and 9 deletions.
27 changes: 18 additions & 9 deletions openhands/llm/llm.py
@@ -500,17 +500,17 @@ def _post_completion(self, response: ModelResponse) -> float:

         if usage:
             # keep track of the input and output tokens
-            input_tokens = usage.get('prompt_tokens')
-            output_tokens = usage.get('completion_tokens')
+            prompt_tokens = usage.get('prompt_tokens', 0)
+            completion_tokens = usage.get('completion_tokens', 0)
 
-            if input_tokens:
-                stats += 'Input tokens: ' + str(input_tokens)
+            if prompt_tokens:
+                stats += 'Input tokens: ' + str(prompt_tokens)
 
-            if output_tokens:
+            if completion_tokens:
                 stats += (
-                    (' | ' if input_tokens else '')
+                    (' | ' if prompt_tokens else '')
                     + 'Output tokens: '
-                    + str(output_tokens)
+                    + str(completion_tokens)
                     + '\n'
                 )

@@ -519,7 +519,7 @@ def _post_completion(self, response: ModelResponse) -> float:
                 'prompt_tokens_details'
             )
             cache_hit_tokens = (
-                prompt_tokens_details.cached_tokens if prompt_tokens_details else None
+                prompt_tokens_details.cached_tokens if prompt_tokens_details else 0
             )
             if cache_hit_tokens:
                 stats += 'Input tokens (cache hit): ' + str(cache_hit_tokens) + '\n'
@@ -528,10 +528,19 @@ def _post_completion(self, response: ModelResponse) -> float:
             # but litellm doesn't separate them in the usage stats
             # so we can read it from the provider-specific extra field
             model_extra = usage.get('model_extra', {})
-            cache_write_tokens = model_extra.get('cache_creation_input_tokens')
+            cache_write_tokens = model_extra.get('cache_creation_input_tokens', 0)
             if cache_write_tokens:
                 stats += 'Input tokens (cache write): ' + str(cache_write_tokens) + '\n'
 
+            # Record in metrics
+            # We'll treat cache_hit_tokens as "cache read" and cache_write_tokens as "cache write"
+            self.metrics.add_tokens_usage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                cache_read_tokens=cache_hit_tokens,
+                cache_write_tokens=cache_write_tokens,
+            )
+
         # log the stats
         if stats:
             logger.debug(stats)
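For reference, a minimal, self-contained sketch of how the four counters recorded above map onto a litellm-style usage payload. The fields mirror those read in the diff (prompt_tokens, completion_tokens, prompt_tokens_details.cached_tokens, model_extra['cache_creation_input_tokens']); the numbers and the SimpleNamespace stand-in are invented for illustration and are not part of the commit.

# Illustrative sketch only: a fake usage payload and the same extraction logic as in the diff above.
from types import SimpleNamespace

usage = {
    'prompt_tokens': 1200,
    'completion_tokens': 350,
    'prompt_tokens_details': SimpleNamespace(cached_tokens=800),  # cache read
    'model_extra': {'cache_creation_input_tokens': 400},  # Anthropic-specific cache write
}

prompt_tokens = usage.get('prompt_tokens', 0)
completion_tokens = usage.get('completion_tokens', 0)

prompt_tokens_details = usage.get('prompt_tokens_details')
cache_hit_tokens = (
    prompt_tokens_details.cached_tokens if prompt_tokens_details else 0
)
cache_write_tokens = usage.get('model_extra', {}).get('cache_creation_input_tokens', 0)

print(prompt_tokens, completion_tokens, cache_hit_tokens, cache_write_tokens)
# -> 1200 350 800 400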
56 changes: 56 additions & 0 deletions openhands/llm/metrics.py
@@ -17,18 +17,35 @@ class ResponseLatency(BaseModel):
     response_id: str
 
 
+class TokensUsage(BaseModel):
+    """Metric tracking detailed token usage per completion call."""
+
+    model: str
+    prompt_tokens: int
+    completion_tokens: int
+    cache_read_tokens: int
+    cache_write_tokens: int
+    timestamp: float = Field(default_factory=time.time)
+
+
 class Metrics:
     """Metrics class can record various metrics during running and evaluation.
     Currently, we define the following metrics:
     accumulated_cost: the total cost (USD $) of the current LLM.
     response_latency: the time taken for each LLM completion call.
+    accrued token usage: the total tokens used across all completions.
     """
 
     def __init__(self, model_name: str = 'default') -> None:
         self._accumulated_cost: float = 0.0
         self._costs: list[Cost] = []
         self._response_latencies: list[ResponseLatency] = []
         self.model_name = model_name
+        self._accumulated_prompt_tokens = 0
+        self._accumulated_completion_tokens = 0
+        self._accumulated_cache_read_tokens = 0
+        self._accumulated_cache_write_tokens = 0
+        self._tokens_usages: list[TokensUsage] = []
 
     @property
     def accumulated_cost(self) -> float:
@@ -67,16 +84,50 @@ def add_response_latency(self, value: float, response_id: str) -> None:
             )
         )
 
+    def add_tokens_usage(
+        self,
+        prompt_tokens: int,
+        completion_tokens: int,
+        cache_read_tokens: int,
+        cache_write_tokens: int,
+    ) -> None:
+        # accumulate
+        self._accumulated_prompt_tokens += prompt_tokens
+        self._accumulated_completion_tokens += completion_tokens
+        self._accumulated_cache_read_tokens += cache_read_tokens
+        self._accumulated_cache_write_tokens += cache_write_tokens
+
+        # record this individual usage
+        self._tokens_usages.append(
+            TokensUsage(
+                model=self.model_name,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                cache_read_tokens=cache_read_tokens,
+                cache_write_tokens=cache_write_tokens,
+            )
+        )
+
     def merge(self, other: 'Metrics') -> None:
         self._accumulated_cost += other.accumulated_cost
         self._costs += other._costs
         self._response_latencies += other._response_latencies
+        self._accumulated_prompt_tokens += other._accumulated_prompt_tokens
+        self._accumulated_completion_tokens += other._accumulated_completion_tokens
+        self._accumulated_cache_read_tokens += other._accumulated_cache_read_tokens
+        self._accumulated_cache_write_tokens += other._accumulated_cache_write_tokens
+        self._tokens_usages += other._tokens_usages
 
     def get(self) -> dict:
         """Return the metrics in a dictionary."""
         return {
             'accumulated_cost': self._accumulated_cost,
             'costs': [cost.model_dump() for cost in self._costs],
+            'accumulated_prompt_tokens': self._accumulated_prompt_tokens,
+            'accumulated_completion_tokens': self._accumulated_completion_tokens,
+            'accumulated_cache_read_tokens': self._accumulated_cache_read_tokens,
+            'accumulated_cache_write_tokens': self._accumulated_cache_write_tokens,
+            'tokens_usages': [usage.model_dump() for usage in self._tokens_usages],
             'response_latencies': [
                 latency.model_dump() for latency in self._response_latencies
             ],
@@ -86,6 +137,11 @@ def reset(self):
         self._accumulated_cost = 0.0
         self._costs = []
         self._response_latencies = []
+        self._accumulated_prompt_tokens = 0
+        self._accumulated_completion_tokens = 0
+        self._accumulated_cache_read_tokens = 0
+        self._accumulated_cache_write_tokens = 0
+        self._tokens_usages = []
 
     def log(self):
         """Log the metrics."""
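A rough usage sketch of the new metrics API, assuming the import path openhands.llm.metrics from the changed file; the model name and token counts below are invented for illustration.

# Sketch only: exercise Metrics.add_tokens_usage and get() as added in this commit.
from openhands.llm.metrics import Metrics

metrics = Metrics(model_name='some-model')  # hypothetical model name
metrics.add_tokens_usage(
    prompt_tokens=1200,
    completion_tokens=350,
    cache_read_tokens=800,
    cache_write_tokens=400,
)
metrics.add_tokens_usage(
    prompt_tokens=900,
    completion_tokens=120,
    cache_read_tokens=0,
    cache_write_tokens=0,
)

snapshot = metrics.get()
# snapshot['accumulated_prompt_tokens'] == 2100
# snapshot['accumulated_completion_tokens'] == 470
# snapshot['tokens_usages'] holds one TokensUsage dump per call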
