Skip to content

Commit

Permalink
Merge
Browse files Browse the repository at this point in the history
  • Loading branch information
SmartManoj committed Nov 5, 2024
2 parents c8988f2 + eeb2342 commit ebb2fe7
Show file tree
Hide file tree
Showing 47 changed files with 578 additions and 403 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ Pour créer un workflow d'évaluation pour votre benchmark, suivez ces étapes :
instruction=instruction,
test_result=evaluation_result,
metadata=metadata,
history=state.history.compatibility_for_eval_history_pairs(),
history=compatibility_for_eval_history_pairs(state.history),
metrics=state.metrics.get() if state.metrics else None,
error=state.last_error if state and state.last_error else None,
)
Expand Down Expand Up @@ -260,7 +260,7 @@ def codeact_user_response(state: State | None) -> str:
# vérifier si l'agent a essayé de parler à l'utilisateur 3 fois, si oui, faire savoir à l'agent qu'il peut abandonner
user_msgs = [
event
for event in state.history.get_events()
for event in state.history
if isinstance(event, MessageAction) and event.source == 'user'
]
if len(user_msgs) >= 2:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它工
instruction=instruction,
test_result=evaluation_result,
metadata=metadata,
history=state.history.compatibility_for_eval_history_pairs(),
history=compatibility_for_eval_history_pairs(state.history),
metrics=state.metrics.get() if state.metrics else None,
error=state.last_error if state and state.last_error else None,
)
Expand Down Expand Up @@ -257,7 +257,7 @@ def codeact_user_response(state: State | None) -> str:
# 检查代理是否已尝试与用户对话 3 次,如果是,让代理知道它可以放弃
user_msgs = [
event
for event in state.history.get_events()
for event in state.history
if isinstance(event, MessageAction) and event.source == 'user'
]
if len(user_msgs) >= 2:
Expand Down
4 changes: 2 additions & 2 deletions docs/modules/usage/how-to/evaluation-harness.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
instruction=instruction,
test_result=evaluation_result,
metadata=metadata,
history=state.history.compatibility_for_eval_history_pairs(),
history=compatibility_for_eval_history_pairs(state.history),
metrics=state.metrics.get() if state.metrics else None,
error=state.last_error if state and state.last_error else None,
)
Expand Down Expand Up @@ -257,7 +257,7 @@ def codeact_user_response(state: State | None) -> str:
# check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
user_msgs = [
event
for event in state.history.get_events()
for event in state.history
if isinstance(event, MessageAction) and event.source == 'user'
]
if len(user_msgs) >= 2:
Expand Down
7 changes: 4 additions & 3 deletions evaluation/EDA/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand All @@ -34,7 +35,7 @@ def codeact_user_response_eda(state: State) -> str:

# retrieve the latest model message from history
if state.history:
model_guess = state.history.get_last_agent_message()
model_guess = state.get_last_agent_message()

assert game is not None, 'Game is not initialized.'
msg = game.generate_user_response(model_guess)
Expand Down Expand Up @@ -139,7 +140,7 @@ def process_instance(
if state is None:
raise ValueError('State should not be None.')

final_message = state.history.get_last_agent_message()
final_message = state.get_last_agent_message()

logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
test_result = game.reward()
Expand All @@ -148,7 +149,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
5 changes: 3 additions & 2 deletions evaluation/agent_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -242,7 +243,7 @@ def process_instance(
raw_ans = ''

# retrieve the last agent message or thought
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
raw_ans = event.thought
Expand Down Expand Up @@ -271,7 +272,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

metrics = state.metrics.get() if state.metrics else None

Expand Down
3 changes: 2 additions & 1 deletion evaluation/aider_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -250,7 +251,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)
metrics = state.metrics.get() if state.metrics else None

# Save the output
Expand Down
3 changes: 2 additions & 1 deletion evaluation/biocoder/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -299,7 +300,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

test_result['generated'] = test_result['metadata']['1_copy_change_code']

Expand Down
5 changes: 3 additions & 2 deletions evaluation/bird/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -46,7 +47,7 @@ def codeact_user_response(state: State) -> str:
# check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
user_msgs = [
event
for event in state.history.get_events()
for event in state.history
if isinstance(event, MessageAction) and event.source == 'user'
]
if len(user_msgs) > 2:
Expand Down Expand Up @@ -431,7 +432,7 @@ def execute_sql(db_path, sql):
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
3 changes: 2 additions & 1 deletion evaluation/browsing_delegation/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -89,7 +90,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# find the last delegate action
last_delegate_action = None
Expand Down
7 changes: 4 additions & 3 deletions evaluation/discoverybench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -173,14 +174,14 @@ def initialize_runtime(runtime: Runtime, data_files: list[str]):


def get_last_agent_finish_action(state: State) -> AgentFinishAction:
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if isinstance(event, AgentFinishAction):
return event
return None


def get_last_message_action(state: State) -> MessageAction:
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if isinstance(event, MessageAction):
return event
return None
Expand Down Expand Up @@ -307,7 +308,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# DiscoveryBench Evaluation
eval_rec = run_eval_gold_vs_gen_NL_hypo_workflow(
Expand Down
5 changes: 3 additions & 2 deletions evaluation/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -166,7 +167,7 @@ def process_instance(

model_answer_raw = ''
# get the last message or thought from the agent
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
model_answer_raw = event.thought
Expand Down Expand Up @@ -203,7 +204,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
5 changes: 3 additions & 2 deletions evaluation/gorilla/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -101,7 +102,7 @@ def process_instance(
raise ValueError('State should not be None.')

# retrieve the last message from the agent
model_answer_raw = state.history.get_last_agent_message()
model_answer_raw = state.get_last_agent_message()

# attempt to parse model_answer
ast_eval_fn = instance['ast_eval']
Expand All @@ -114,7 +115,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

output = EvalOutput(
instance_id=instance_id,
Expand Down
5 changes: 3 additions & 2 deletions evaluation/gpqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -244,7 +245,7 @@ def process_instance(
'C': False,
'D': False,
}
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if (
isinstance(event, AgentFinishAction)
and event.source != 'user'
Expand Down Expand Up @@ -300,7 +301,7 @@ def process_instance(
instance_id=str(instance.instance_id),
instruction=instruction,
metadata=metadata,
history=state.history.compatibility_for_eval_history_pairs(),
history=compatibility_for_eval_history_pairs(state.history),
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
Expand Down
3 changes: 2 additions & 1 deletion evaluation/humanevalfix/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -255,7 +256,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
2 changes: 1 addition & 1 deletion evaluation/integration_tests/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def process_instance(
# # result evaluation
# # =============================================

histories = [event_to_dict(event) for event in state.history.get_events()]
histories = [event_to_dict(event) for event in state.history]
test_result: TestResult = test_class.verify_result(runtime, histories)
metrics = state.metrics.get() if state.metrics else None

Expand Down
5 changes: 3 additions & 2 deletions evaluation/logic_reasoning/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -225,7 +226,7 @@ def process_instance(
raise ValueError('State should not be None.')

final_message = ''
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if isinstance(event, AgentFinishAction):
final_message = event.thought
break
Expand All @@ -247,7 +248,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
5 changes: 3 additions & 2 deletions evaluation/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -182,7 +183,7 @@ def process_instance(

# Instruction is the first message from the USER
instruction = ''
for event in state.history.get_events():
for event in state.history:
if isinstance(event, MessageAction):
instruction = event.content
break
Expand All @@ -194,7 +195,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
Loading

0 comments on commit ebb2fe7

Please sign in to comment.