Commit

Merge branch 'main' into feature/strict-mypy-checks
neubig authored Feb 24, 2025
2 parents 18d8aa9 + 2d2dbf1 commit b147f6a
Showing 9 changed files with 352 additions and 31 deletions.
14 changes: 11 additions & 3 deletions openhands/core/config/condenser_config.py
@@ -26,8 +26,10 @@ class RecentEventsCondenserConfig(BaseModel):
"""Configuration for RecentEventsCondenser."""

type: Literal['recent'] = Field('recent')

# at least one event by default, because the best guess is that it is the user task
keep_first: int = Field(
default=0,
default=1,
description='The number of initial events to condense.',
ge=0,
)
@@ -43,6 +45,8 @@ class LLMSummarizingCondenserConfig(BaseModel):
llm_config: LLMConfig = Field(
..., description='Configuration for the LLM to use for condensing.'
)

# at least one event by default, because the best guess is that it's the user task
keep_first: int = Field(
default=1,
description='The number of initial events to condense.',
@@ -62,8 +66,10 @@ class AmortizedForgettingCondenserConfig(BaseModel):
description='Maximum size of the condensed history before triggering forgetting.',
ge=2,
)

# at least one event by default, because the best guess is that it's the user task
keep_first: int = Field(
default=0,
default=1,
description='Number of initial events to always keep in history.',
ge=0,
)
@@ -81,8 +87,10 @@ class LLMAttentionCondenserConfig(BaseModel):
description='Maximum size of the condensed history before triggering forgetting.',
ge=2,
)

# at least one event by default, because the best guess is that it's the user task
keep_first: int = Field(
default=0,
default=1,
description='Number of initial events to always keep in history.',
ge=0,
)
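
Note (illustration, not part of this commit): a minimal sketch of the new keep_first default, assuming the remaining config fields (e.g. max_events) keep their repository defaults.

# Sketch only: the first event (best guess: the user task) is now kept by default.
from openhands.core.config.condenser_config import RecentEventsCondenserConfig

config = RecentEventsCondenserConfig()
assert config.keep_first == 1  # new default; previously 0

# The old behavior is still available explicitly.
legacy_config = RecentEventsCondenserConfig(keep_first=0)
assert legacy_config.keep_first == 0
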
45 changes: 45 additions & 0 deletions openhands/core/message_utils.py
@@ -29,6 +29,7 @@
from openhands.events.observation.error import ErrorObservation
from openhands.events.observation.observation import Observation
from openhands.events.serialization.event import truncate_content
from openhands.llm.metrics import Metrics, TokenUsage


def events_to_messages(
@@ -362,3 +363,47 @@ def apply_prompt_caching(messages: list[Message]) -> None:
-1
].cache_prompt = True # Last item inside the message content
break


def get_token_usage_for_event(event: Event, metrics: Metrics) -> TokenUsage | None:
"""
Returns at most one token usage record for the `model_response.id` in this event's
`tool_call_metadata`.
If no response_id is found, or none match in metrics.token_usages, returns None.
"""
if event.tool_call_metadata and event.tool_call_metadata.model_response:
response_id = event.tool_call_metadata.model_response.get('id')
if response_id:
return next(
(
usage
for usage in metrics.token_usages
if usage.response_id == response_id
),
None,
)
return None


def get_token_usage_for_event_id(
events: list[Event], event_id: int, metrics: Metrics
) -> TokenUsage | None:
"""
Starting from the event with .id == event_id and moving backwards in `events`,
find the first TokenUsage record (if any) associated with a response_id from
tool_call_metadata.model_response.id.
Returns the first match found, or None if none is found.
"""
# find the index of the event with the given id
idx = next((i for i, e in enumerate(events) if e.id == event_id), None)
if idx is None:
return None

# search backward from idx down to 0
for i in range(idx, -1, -1):
usage = get_token_usage_for_event(events[i], metrics)
if usage is not None:
return usage
return None
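
Note (illustration, not part of this commit): a hedged sketch of calling the new helper; the Event import path and the surrounding wiring are assumptions, not shown in this diff.

from openhands.core.message_utils import get_token_usage_for_event_id
from openhands.events.event import Event  # assumed import path for Event
from openhands.llm.metrics import Metrics


def describe_usage(events: list[Event], metrics: Metrics, event_id: int) -> str:
    """Summarize the token usage recorded at (or nearest before) event_id."""
    usage = get_token_usage_for_event_id(events, event_id, metrics)
    if usage is None:
        return f'no token usage recorded at or before event {event_id}'
    return (
        f'event {event_id}: prompt={usage.prompt_tokens}, '
        f'completion={usage.completion_tokens}, '
        f'cache_read={usage.cache_read_tokens}, cache_write={usage.cache_write_tokens}'
    )
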
29 changes: 20 additions & 9 deletions openhands/llm/llm.py
@@ -497,20 +497,21 @@ def _post_completion(self, response: ModelResponse) -> float:
stats += 'Response Latency: %.3f seconds\n' % latest_latency.latency

usage: Usage | None = response.get('usage')
response_id = response.get('id', 'unknown')

if usage:
# keep track of the input and output tokens
input_tokens = usage.get('prompt_tokens')
output_tokens = usage.get('completion_tokens')
prompt_tokens = usage.get('prompt_tokens', 0)
completion_tokens = usage.get('completion_tokens', 0)

if input_tokens:
stats += 'Input tokens: ' + str(input_tokens)
if prompt_tokens:
stats += 'Input tokens: ' + str(prompt_tokens)

if output_tokens:
if completion_tokens:
stats += (
(' | ' if input_tokens else '')
(' | ' if prompt_tokens else '')
+ 'Output tokens: '
+ str(output_tokens)
+ str(completion_tokens)
+ '\n'
)

@@ -519,7 +520,7 @@ def _post_completion(self, response: ModelResponse) -> float:
'prompt_tokens_details'
)
cache_hit_tokens = (
prompt_tokens_details.cached_tokens if prompt_tokens_details else None
prompt_tokens_details.cached_tokens if prompt_tokens_details else 0
)
if cache_hit_tokens:
stats += 'Input tokens (cache hit): ' + str(cache_hit_tokens) + '\n'
@@ -528,10 +529,20 @@ def _post_completion(self, response: ModelResponse) -> float:
# but litellm doesn't separate them in the usage stats
# so we can read it from the provider-specific extra field
model_extra = usage.get('model_extra', {})
cache_write_tokens = model_extra.get('cache_creation_input_tokens')
cache_write_tokens = model_extra.get('cache_creation_input_tokens', 0)
if cache_write_tokens:
stats += 'Input tokens (cache write): ' + str(cache_write_tokens) + '\n'

# Record in metrics
# We'll treat cache_hit_tokens as "cache read" and cache_write_tokens as "cache write"
self.metrics.add_token_usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
cache_read_tokens=cache_hit_tokens,
cache_write_tokens=cache_write_tokens,
response_id=response_id,
)

# log the stats
if stats:
logger.debug(stats)
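
Note (illustration only): a rough sketch of the token accounting above, using a hand-built dict in place of a real litellm ModelResponse; the field names mirror the ones read in _post_completion, and prompt_tokens_details is simplified to a plain dict.

# Hand-built stand-in for response['usage']; not a real litellm object.
usage = {
    'prompt_tokens': 1200,
    'completion_tokens': 300,
    'prompt_tokens_details': {'cached_tokens': 800},      # cache read (hit)
    'model_extra': {'cache_creation_input_tokens': 400},  # Anthropic-style cache write
}

prompt_tokens = usage.get('prompt_tokens', 0)
completion_tokens = usage.get('completion_tokens', 0)
cache_hit_tokens = (usage.get('prompt_tokens_details') or {}).get('cached_tokens', 0)
cache_write_tokens = usage.get('model_extra', {}).get('cache_creation_input_tokens', 0)

# These four values (plus the response id) are what add_token_usage() records above.
print(prompt_tokens, completion_tokens, cache_hit_tokens, cache_write_tokens)
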
56 changes: 52 additions & 4 deletions openhands/llm/metrics.py
@@ -17,18 +17,31 @@ class ResponseLatency(BaseModel):
response_id: str


class TokenUsage(BaseModel):
"""Metric tracking detailed token usage per completion call."""

model: str
prompt_tokens: int
completion_tokens: int
cache_read_tokens: int
cache_write_tokens: int
response_id: str


class Metrics:
"""Metrics class can record various metrics during running and evaluation.
Currently, we define the following metrics:
accumulated_cost: the total cost (USD $) of the current LLM.
response_latency: the time taken for each LLM completion call.
We track:
- accumulated_cost and costs
- A list of ResponseLatency
- A list of TokenUsage (one per call).
"""

def __init__(self, model_name: str = 'default') -> None:
self._accumulated_cost: float = 0.0
self._costs: list[Cost] = []
self._response_latencies: list[ResponseLatency] = []
self.model_name = model_name
self._token_usages: list[TokenUsage] = []

@property
def accumulated_cost(self) -> float:
@@ -54,6 +67,16 @@ def response_latencies(self) -> list[ResponseLatency]:
def response_latencies(self, value: list[ResponseLatency]) -> None:
self._response_latencies = value

@property
def token_usages(self) -> list[TokenUsage]:
if not hasattr(self, '_token_usages'):
self._token_usages = []
return self._token_usages

@token_usages.setter
def token_usages(self, value: list[TokenUsage]) -> None:
self._token_usages = value

def add_cost(self, value: float) -> None:
if value < 0:
raise ValueError('Added cost cannot be negative.')
@@ -67,10 +90,33 @@ def add_response_latency(self, value: float, response_id: str) -> None:
)
)

def add_token_usage(
self,
prompt_tokens: int,
completion_tokens: int,
cache_read_tokens: int,
cache_write_tokens: int,
response_id: str,
) -> None:
"""Add a single usage record."""
self._token_usages.append(
TokenUsage(
model=self.model_name,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
response_id=response_id,
)
)

def merge(self, other: 'Metrics') -> None:
"""Merge 'other' metrics into this one."""
self._accumulated_cost += other.accumulated_cost
self._costs += other._costs
self._response_latencies += other._response_latencies
# use the property so older pickled objects that lack the field won't crash
self.token_usages += other.token_usages
self.response_latencies += other.response_latencies

def get(self) -> dict:
"""Return the metrics in a dictionary."""
@@ -80,12 +126,14 @@ def get(self) -> dict:
'response_latencies': [
latency.model_dump() for latency in self._response_latencies
],
'token_usages': [usage.model_dump() for usage in self._token_usages],
}

def reset(self):
self._accumulated_cost = 0.0
self._costs = []
self._response_latencies = []
self._token_usages = []

def log(self):
"""Log the metrics."""
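
Note (illustration, not part of this commit): a small sketch of recording and merging token usage with the new API; the model name and response id are placeholders.

from openhands.llm.metrics import Metrics

agent_metrics = Metrics(model_name='example-model')      # placeholder model name
condenser_metrics = Metrics(model_name='example-model')

condenser_metrics.add_token_usage(
    prompt_tokens=1200,
    completion_tokens=300,
    cache_read_tokens=800,
    cache_write_tokens=400,
    response_id='resp-abc123',  # placeholder litellm response id
)

# merge() now also carries TokenUsage records across Metrics instances.
agent_metrics.merge(condenser_metrics)
assert len(agent_metrics.token_usages) == 1
print(agent_metrics.get()['token_usages'])
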
2 changes: 1 addition & 1 deletion openhands/memory/condenser/impl/llm_attention_condenser.py
@@ -18,7 +18,7 @@ class ImportantEventSelection(BaseModel):
class LLMAttentionCondenser(RollingCondenser):
"""Rolling condenser strategy that uses an LLM to select the most important events when condensing the history."""

def __init__(self, llm: LLM, max_size: int = 100, keep_first: int = 0):
def __init__(self, llm: LLM, max_size: int = 100, keep_first: int = 1):
if keep_first >= max_size // 2:
raise ValueError(
f'keep_first ({keep_first}) must be less than half of max_size ({max_size})'
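
Note (illustration): with keep_first now defaulting to 1, the existing keep_first < max_size // 2 check means a max_size of 2 must be paired with an explicit keep_first=0, as the updated tests further down do. A hedged sketch, assuming LLM is importable from openhands.llm.llm:

from openhands.llm.llm import LLM  # assumed import path
from openhands.memory.condenser.impl.llm_attention_condenser import LLMAttentionCondenser


def build_condenser(llm: LLM, small_window: bool) -> LLMAttentionCondenser:
    """Hypothetical helper showing the keep_first/max_size constraint."""
    if small_window:
        # keep_first=1 >= 2 // 2 would raise ValueError, so keep_first must be 0 here.
        return LLMAttentionCondenser(llm=llm, max_size=2, keep_first=0)
    # With a larger window the new default (keep_first=1) is valid: 1 < 4 // 2.
    return LLMAttentionCondenser(llm=llm, max_size=4)
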
2 changes: 1 addition & 1 deletion openhands/memory/condenser/impl/recent_events_condenser.py
@@ -8,7 +8,7 @@
class RecentEventsCondenser(Condenser):
"""A condenser that only keeps a certain number of the most recent events."""

def __init__(self, keep_first: int = 0, max_events: int = 10):
def __init__(self, keep_first: int = 1, max_events: int = 10):
self.keep_first = keep_first
self.max_events = max_events

55 changes: 44 additions & 11 deletions tests/unit/test_condenser.py
@@ -38,7 +38,7 @@ def create_test_event(
event = Event()
event._message = message
event.timestamp = timestamp if timestamp else datetime.now()
if id:
if id is not None:
event._id = id
event._source = EventSource.USER
return event
@@ -186,13 +186,14 @@ def test_recent_events_condenser():
assert result == events

# If the max_events are smaller than the number of events, only keep the last few.
max_events = 2
max_events = 3
condenser = RecentEventsCondenser(max_events=max_events)
result = condenser.condensed_history(mock_state)

assert len(result) == max_events
assert result[0]._message == 'Event 4'
assert result[1]._message == 'Event 5'
assert result[0]._message == 'Event 1' # kept from keep_first
assert result[1]._message == 'Event 4' # kept from max_events
assert result[2]._message == 'Event 5' # kept from max_events

# If the keep_first flag is set, the first event will always be present.
keep_first = 1
Expand All @@ -211,9 +212,9 @@ def test_recent_events_condenser():
result = condenser.condensed_history(mock_state)

assert len(result) == max_events
assert result[0]._message == 'Event 1'
assert result[1]._message == 'Event 2'
assert result[2]._message == 'Event 5'
assert result[0]._message == 'Event 1' # kept from keep_first
assert result[1]._message == 'Event 2' # kept from keep_first
assert result[2]._message == 'Event 5' # kept from max_events


def test_llm_summarization_condenser_from_config():
@@ -539,7 +540,7 @@ def test_llm_attention_condenser_forgets_when_larger_than_max_size(
):
"""Test that the LLMAttentionCondenser forgets events when the context grows too large."""
max_size = 2
condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm)
condenser = LLMAttentionCondenser(max_size=max_size, keep_first=0, llm=mock_llm)

for i in range(max_size * 10):
event = create_test_event(f'Event {i}', id=i)
@@ -560,7 +561,7 @@
def test_llm_attention_condenser_handles_events_outside_history(mock_llm, mock_state):
"""Test that the LLMAttentionCondenser handles event IDs that aren't from the event history."""
max_size = 2
condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm)
condenser = LLMAttentionCondenser(max_size=max_size, keep_first=0, llm=mock_llm)

for i in range(max_size * 10):
event = create_test_event(f'Event {i}', id=i)
@@ -580,7 +581,7 @@
def test_llm_attention_condenser_handles_too_many_events(mock_llm, mock_state):
"""Test that the LLMAttentionCondenser handles when the response contains too many event IDs."""
max_size = 2
condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm)
condenser = LLMAttentionCondenser(max_size=max_size, keep_first=0, llm=mock_llm)

for i in range(max_size * 10):
event = create_test_event(f'Event {i}', id=i)
@@ -600,7 +601,9 @@
def test_llm_attention_condenser_handles_too_few_events(mock_llm, mock_state):
"""Test that the LLMAttentionCondenser handles when the response contains too few event IDs."""
max_size = 2
condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm)
# Developer note: We must specify keep_first=0 because
# keep_first (1) >= max_size//2 (1) is invalid.
condenser = LLMAttentionCondenser(max_size=max_size, keep_first=0, llm=mock_llm)

for i in range(max_size * 10):
event = create_test_event(f'Event {i}', id=i)
Expand All @@ -614,3 +617,33 @@ def test_llm_attention_condenser_handles_too_few_events(mock_llm, mock_state):

# The number of results should bounce back and forth between 1, 2, 1, 2, ...
assert len(results) == (i % 2) + 1

# Add a new test verifying that keep_first=1 works with max_size > 2


def test_llm_attention_condenser_handles_keep_first_for_larger_max_size(
mock_llm, mock_state
):
"""Test that LLMAttentionCondenser works when keep_first=1 is allowed (must be less than half of max_size)."""
max_size = 4 # so keep_first=1 < (max_size // 2) = 2
condenser = LLMAttentionCondenser(max_size=max_size, keep_first=1, llm=mock_llm)

for i in range(max_size * 2):
# We append new events, then ensure some are pruned.
event = create_test_event(f'Event {i}', id=i)
mock_state.history.append(event)

mock_llm.set_mock_response_content(
ImportantEventSelection(ids=[]).model_dump_json()
)

results = condenser.condensed_history(mock_state)

# We expect that the first event is always kept, and the tail grows until max_size
if len(mock_state.history) <= max_size:
# No condensation needed yet
assert len(results) == len(mock_state.history)
else:
# The first event is kept, plus some from the tail
assert results[0].id == 0
assert len(results) <= max_size